From 7adacf5eb2d2048045d9fd8fdab861fd9e7e2e96 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 4 Dec 2019 15:50:27 +0100 Subject: KVM: x86: use CPUID to locate host page table reserved bits The comment in kvm_get_shadow_phys_bits refers to MKTME, but the same is actually true of SME and SEV. Just use CPUID[0x8000_0008].EAX[7:0] unconditionally if available; it is simplest and works even if memory is not encrypted. Cc: stable@vger.kernel.org Reported-by: Tom Lendacky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f92b40d798c..1e4ee4f8de5f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -538,16 +538,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); static u8 kvm_get_shadow_phys_bits(void) { /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected - * in CPU detection code, but MKTME treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore for MKTME - * we should still return physical address bits reported by CPUID. + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID. */ - if (!boot_cpu_has(X86_FEATURE_TME) || - WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) - return boot_cpu_data.x86_phys_bits; + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; - return cpuid_eax(0x80000008) & 0xff; + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; } static void kvm_mmu_reset_all_pte_masks(void) -- cgit v1.2.3 From 4fb7b452ce7b1490e789bf6fabd4f397cf57a26f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 5 Dec 2019 10:24:38 +0800 Subject: KVM: vmx: remove unreachable statement in vmx_get_msr_feature() We have no way to reach the final statement; remove it. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e3394c839dea..5fb7a1695a24 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1773,8 +1773,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) default: return 1; } - - return 0; } /* -- cgit v1.2.3 From fe3c2b4c228443b505f2d8981c4871b96cfec6d6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 5 Dec 2019 11:40:16 +0800 Subject: KVM: explicitly set rmap_head->val to 0 in pte_list_desc_remove_entry() When we reach here, we have desc->sptes[j] = NULL with j = 0. So we can replace desc->sptes[0] with 0 to make it clearer.
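To illustrate, a condensed sketch of the function's tail (illustrative only, not the full kernel code): the loop above this branch has already compacted the descriptor and cleared the vacated slot, so on the j == 0 path desc->sptes[0] is known to be NULL and the old assignment always stored 0:

        /* condensed from pte_list_desc_remove_entry() */
        desc->sptes[i] = desc->sptes[j];        /* move the last valid entry down */
        desc->sptes[j] = NULL;                  /* clear the vacated slot */
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
                /* single entry left: desc->sptes[0] == NULL, so this wrote 0 */
                rmap_head->val = (unsigned long)desc->sptes[0];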
Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1e4ee4f8de5f..35d37099bec6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1414,7 +1414,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, if (j != 0) return; if (!prev_desc && !desc->more) - rmap_head->val = (unsigned long)desc->sptes[0]; + rmap_head->val = 0; else if (prev_desc) prev_desc->more = desc->more; -- cgit v1.2.3 From b4b2963616bbd91ebb33148522552e1135de56ae Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:16 +0100 Subject: KVM: X86: Fix kvm_bitmap_or_dest_vcpus() to use irq shorthand The 3rd parameter of kvm_apic_match_dest() is the irq shorthand, rather than the irq delivery mode. Fixes: 7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs") Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf9177b4a07f..1eabe58bb6d5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1151,7 +1151,7 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, if (!kvm_apic_present(vcpu)) continue; if (!kvm_apic_match_dest(vcpu, NULL, - irq->delivery_mode, + irq->shorthand, irq->dest_id, irq->dest_mode)) continue; -- cgit v1.2.3 From 59508b303e4e35de9dd708ec87b1e89b1f3c1616 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:17 +0100 Subject: KVM: X86: Move irrelevant declarations out of ioapic.h kvm_apic_match_dest() is declared in both ioapic.h and lapic.h. Remove the declaration in ioapic.h. kvm_apic_compare_prio() is declared in ioapic.h but defined in lapic.c. Move the declaration to lapic.h. kvm_irq_delivery_to_apic() is declared in ioapic.h but defined in irq_comm.c. Move the declaration to irq.h. hyperv.c needs to use kvm_irq_delivery_to_apic(). Include irq.h in hyperv.c. 
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 1 + arch/x86/kvm/ioapic.h | 6 ------ arch/x86/kvm/irq.h | 3 +++ arch/x86/kvm/lapic.h | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 23ff65504d7e..c7d4640b7b1c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -33,6 +33,7 @@ #include #include "trace.h" +#include "irq.h" #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ea1a4e0297da..2fb2e3c80724 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -116,9 +116,6 @@ static inline int ioapic_in_kernel(struct kvm *kvm) } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); -bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode); -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); @@ -126,9 +123,6 @@ void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); -int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, - struct dest_map *dest_map); void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7c6233d37c64..f173ab6b407e 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -113,5 +113,8 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu); int kvm_setup_default_irq_routing(struct kvm *kvm); int kvm_setup_empty_irq_routing(struct kvm *kvm); +int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); #endif diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 39925afdfcdc..0b9bbadd1f3c 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -83,7 +83,7 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); - +int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From c96001c5702e66b64e0ffe533aa19d6567ce15bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:18 +0100 Subject: KVM: X86: Use APIC_DEST_* macros properly in kvm_lapic_irq.dest_mode We were using either APIC_DEST_PHYSICAL|APIC_DEST_LOGICAL or 0|1 to fill in kvm_lapic_irq.dest_mode. It's fine only because, in most cases, dest_mode is checked against APIC_DEST_PHYSICAL (which equals 0). However, that's not consistent. We'll have a problem when we want to start checking against APIC_DEST_LOGICAL, which does not equal 1. This patch first introduces the kvm_lapic_irq_dest_mode() helper, which takes the destination mode as a boolean and returns the corresponding APIC_DEST_* macro.
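To make the inconsistency concrete, a minimal sketch (the APIC_DEST_* values are the ones from apicdef.h; this is an illustration, not a patch hunk):

        #define APIC_DEST_PHYSICAL      0x00000
        #define APIC_DEST_LOGICAL       0x00800

        irq.dest_mode = 1;      /* raw 0|1 encoding meaning "logical" */

        /* Checking for physical mode happens to work, since 1 != 0 ... */
        bool phys = irq.dest_mode == APIC_DEST_PHYSICAL;        /* false, as intended */
        /* ... but checking for logical mode silently fails: 1 != 0x800. */
        bool logical = irq.dest_mode == APIC_DEST_LOGICAL;      /* also false - wrong */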
Then, it replaces the 0|1 settings of irq.dest_mode with the helper. Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/ioapic.c | 9 ++++++--- arch/x86/kvm/irq_comm.c | 7 ++++--- arch/x86/kvm/x86.c | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b79cd6aa4075..2893eae5df9f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1022,6 +1022,11 @@ struct kvm_lapic_irq { bool msi_redir_hint; }; +static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) +{ + return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; +} + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 9fd2dd89a1c5..e623a4f8d27e 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -331,7 +331,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) irq.vector = e->fields.vector; irq.delivery_mode = e->fields.delivery_mode << 8; irq.dest_id = e->fields.dest_id; - irq.dest_mode = e->fields.dest_mode; + irq.dest_mode = + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); bitmap_zero(&vcpu_bitmap, 16); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, &vcpu_bitmap); @@ -343,7 +344,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) * keep ioapic_handled_vectors synchronized. */ irq.dest_id = old_dest_id; - irq.dest_mode = old_dest_mode; + irq.dest_mode = + kvm_lapic_irq_dest_mode( + !!e->fields.dest_mode); kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, &vcpu_bitmap); } @@ -369,7 +372,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.dest_id = entry->fields.dest_id; irqe.vector = entry->fields.vector; - irqe.dest_mode = entry->fields.dest_mode; + irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode); irqe.trig_mode = entry->fields.trig_mode; irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8ecd48d31800..22108ed66a76 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -52,8 +52,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; - if (irq->dest_mode == 0 && irq->dest_id == 0xff && - kvm_lowest_prio_delivery(irq)) { + if (irq->dest_mode == APIC_DEST_PHYSICAL && + irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -114,7 +114,8 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; + irq->dest_mode = kvm_lapic_irq_dest_mode( + !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo)); irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; irq->delivery_mode = e->msi.data & 0x700; irq->msi_redir_hint = ((e->msi.address_lo diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf917139de6b..f4e477e6c954 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7358,7 +7358,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, 
unsigned long flags, int apicid) struct kvm_lapic_irq lapic_irq; lapic_irq.shorthand = 0; - lapic_irq.dest_mode = 0; + lapic_irq.dest_mode = APIC_DEST_PHYSICAL; lapic_irq.level = 0; lapic_irq.dest_id = apicid; lapic_irq.msi_redir_hint = false; -- cgit v1.2.3 From ac8ef992cd02cdb8290ca788746d283fe3092500 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:19 +0100 Subject: KVM: X86: Drop KVM_APIC_SHORT_MASK and KVM_APIC_DEST_MASK We have both APIC_SHORT_MASK and KVM_APIC_SHORT_MASK defined for the shorthand mask. Similarly, we have both APIC_DEST_MASK and KVM_APIC_DEST_MASK defined for the destination mode mask. Drop the KVM_APIC_* macros and convert their only user to the APIC_DEST_* macros instead. While at it, move APIC_SHORT_MASK and APIC_DEST_MASK from lapic.c to lapic.h. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 --- arch/x86/kvm/lapic.h | 5 +++-- arch/x86/kvm/svm.c | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eabe58bb6d5..805c18178bbf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -56,9 +56,6 @@ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ -#define APIC_SHORT_MASK 0xc0000 -#define APIC_DEST_NOSHORT 0x0 -#define APIC_DEST_MASK 0x800 #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0b9bbadd1f3c..5a9f29ed9a4b 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,8 +10,9 @@ #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 -#define KVM_APIC_SHORT_MASK 0xc0000 -#define KVM_APIC_DEST_MASK 0x800 +#define APIC_SHORT_MASK 0xc0000 +#define APIC_DEST_NOSHORT 0x0 +#define APIC_DEST_MASK 0x800 #define APIC_BUS_CYCLE_NS 1 #define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 122d4ce3b1ab..8f1b715dfde8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4519,9 +4519,9 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) */ kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, apic, - icrl & KVM_APIC_SHORT_MASK, + icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & KVM_APIC_DEST_MASK); + icrl & APIC_DEST_MASK); if (m && !avic_vcpu_is_running(vcpu)) kvm_vcpu_wake_up(vcpu); -- cgit v1.2.3 From 5c69d5c113f15a7a9956185b815d14d50f3efad4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:20 +0100 Subject: KVM: X86: Fix callers of kvm_apic_match_dest() to use correct macros Callers of kvm_apic_match_dest() should always pass in APIC_DEST_* macros for both the dest_mode and short_hand parameters. Fix up all the callers of kvm_apic_match_dest() that are not following the rule. While at it, rename the parameter from short_hand to shorthand in kvm_apic_match_dest(), as suggested by Vitaly.
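For reference, a condensed sketch of the calling convention this patch establishes (it mirrors the ioapic.c hunks below and is shown only to make the rule explicit):

        /* shorthand: always an APIC_DEST_* shorthand macro, never a bare 0; */
        /* dest_mode: always derived via kvm_lapic_irq_dest_mode().          */
        kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
                            e->fields.dest_id,
                            kvm_lapic_irq_dest_mode(!!e->fields.dest_mode));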
Reported-by: Sean Christopherson Reported-by: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 11 +++++++---- arch/x86/kvm/irq_comm.c | 3 ++- arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/lapic.h | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index e623a4f8d27e..f53daeaaeb37 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -108,8 +108,9 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; - if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, - e->fields.dest_mode)) + if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + e->fields.dest_id, + kvm_lapic_irq_dest_mode(!!e->fields.dest_mode))) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); @@ -250,8 +251,10 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { - if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode) || + u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); + + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + e->fields.dest_id, dm) || kvm_apic_pending_eoi(vcpu, e->fields.vector)) __set_bit(e->fields.vector, ioapic_handled_vectors); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 22108ed66a76..7d083f71fc8e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -417,7 +417,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + if (irq.level && + kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, irq.dest_id, irq.dest_mode)) __set_bit(irq.vector, ioapic_handled_vectors); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 805c18178bbf..679692b55f6d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -789,13 +789,13 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode) + int shorthand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); ASSERT(target); - switch (short_hand) { + switch (shorthand) { case APIC_DEST_NOSHORT: if (dest_mode == APIC_DEST_PHYSICAL) return kvm_apic_match_physical_addr(target, mda); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 5a9f29ed9a4b..ec730ce7a344 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -83,7 +83,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, unsigned int dest, int dest_mode); + int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); -- cgit v1.2.3 From 150a84fee84fbaf2a2a6c76c44ae027b5c7d151a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 4 Dec 2019 20:07:21 +0100 Subject: KVM: X86: Convert the 
last users of "shorthand = 0" to use macros Change the last users of "shorthand = 0" to use APIC_DEST_NOSHORT. Reviewed-by: Vitaly Kuznetsov Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 4 ++-- arch/x86/kvm/irq_comm.c | 2 +- arch/x86/kvm/x86.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index f53daeaaeb37..77538fd77dc2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -330,7 +330,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.delivery_mode == APIC_DM_FIXED) { struct kvm_lapic_irq irq; - irq.shorthand = 0; + irq.shorthand = APIC_DEST_NOSHORT; irq.vector = e->fields.vector; irq.delivery_mode = e->fields.delivery_mode << 8; irq.dest_id = e->fields.dest_id; @@ -379,7 +379,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.trig_mode = entry->fields.trig_mode; irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; - irqe.shorthand = 0; + irqe.shorthand = APIC_DEST_NOSHORT; irqe.msi_redir_hint = false; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 7d083f71fc8e..9d711c2451c7 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -121,7 +121,7 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, irq->msi_redir_hint = ((e->msi.address_lo & MSI_ADDR_REDIRECTION_LOWPRI) > 0); irq->level = 1; - irq->shorthand = 0; + irq->shorthand = APIC_DEST_NOSHORT; } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4e477e6c954..16902d0aad3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7357,7 +7357,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) { struct kvm_lapic_irq lapic_irq; - lapic_irq.shorthand = 0; + lapic_irq.shorthand = APIC_DEST_NOSHORT; lapic_irq.dest_mode = APIC_DEST_PHYSICAL; lapic_irq.level = 0; lapic_irq.dest_id = apicid; -- cgit v1.2.3 From 0a03cbdac115fdcc06fd9d05ce3c389d0ead9a71 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 6 Dec 2019 16:20:18 +0800 Subject: KVM: x86: Fix some comment typos Fix some typos in comments. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 35d37099bec6..c19f3ccaace3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1529,7 +1529,7 @@ struct rmap_iterator { /* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the - * information in the itererator may not be valid. + * information in the iterator may not be valid. * * Returns sptep if found, NULL otherwise. */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16902d0aad3a..3051324f72d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9792,7 +9792,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * * The reason is, in case of PML, we need to set D-bit for any slots * with dirty logging disabled in order to eliminate unnecessary GPA - * logging in PML buffer (and potential PML buffer full VMEXT). This + * logging in PML buffer (and potential PML buffer full VMEXIT). 
This * guarantees leaving PML enabled during guest's lifetime won't have * any additional overhead from PML when guest is running with dirty * logging disabled for memory slots. -- cgit v1.2.3 From 9dadc2f918df26e64aa04794cdb4d8667c934f47 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 6 Dec 2019 16:45:24 +0800 Subject: KVM: VMX: Rename INTERRUPT_PENDING to INTERRUPT_WINDOW Rename interrupt-window exiting related definitions to match the latest Intel SDM. No functional changes. Signed-off-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/include/uapi/asm/vmx.h | 4 ++-- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 10 +++++----- tools/arch/x86/include/uapi/asm/vmx.h | 4 ++-- tools/testing/selftests/kvm/include/x86_64/vmx.h | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 1835767aa335..5acda8d9b9a7 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -19,7 +19,7 @@ /* * Definitions of Primary Processor-Based VM-Execution Controls. */ -#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 3eb8411ab60e..e95b72ec19bc 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -33,7 +33,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 -#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 @@ -94,7 +94,7 @@ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4aea7d304beb..a77e92bd3f72 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2172,7 +2172,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * EXEC CONTROLS */ exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; @@ -3183,7 +3183,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, u32 exit_qual; evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_VIRTUAL_NMI_PENDING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); @@ -3408,7 +3408,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) 
&& - !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) && + !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; return kvm_vcpu_halt(vcpu); @@ -5524,8 +5524,8 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return false; case EXIT_REASON_TRIPLE_FAULT: return true; - case EXIT_REASON_PENDING_INTERRUPT: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); + case EXIT_REASON_INTERRUPT_WINDOW: + return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); case EXIT_REASON_TASK_SWITCH: @@ -6015,7 +6015,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | + CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5fb7a1695a24..0693dd0b5dbc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4350,7 +4350,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static void enable_irq_window(struct kvm_vcpu *vcpu) { - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -4969,7 +4969,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) static int handle_interrupt_window(struct kvm_vcpu *vcpu) { - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5203,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); intr_window_requested = exec_controls_get(vmx) & - CPU_BASED_VIRTUAL_INTR_PENDING; + CPU_BASED_INTR_WINDOW_EXITING; while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) @@ -5527,7 +5527,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_CPUID] = kvm_emulate_cpuid, [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, + [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, @@ -5907,7 +5907,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return kvm_emulate_wrmsr(vcpu); else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) return handle_preemption_timer(vcpu); - else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT) + else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) return handle_interrupt_window(vcpu); else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) return handle_external_interrupt(vcpu); diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index 3eb8411ab60e..e95b72ec19bc 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -33,7 +33,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define 
EXIT_REASON_INIT_SIGNAL 3 -#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 @@ -94,7 +94,7 @@ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index f52e0ba84fed..c6e442d7a241 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -18,7 +18,7 @@ /* * Definitions of Primary Processor-Based VM-Execution Controls. */ -#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 +#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 @@ -103,7 +103,7 @@ #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 -#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 -- cgit v1.2.3 From 4e2a0bc56ad197e5ccfab8395649b681067fe8cb Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 6 Dec 2019 16:45:25 +0800 Subject: KVM: VMX: Rename NMI_PENDING to NMI_WINDOW Rename the NMI-window exiting related definitions to match the latest Intel SDM. No functional changes. 
Signed-off-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 4 ++-- tools/testing/selftests/kvm/include/x86_64/vmx.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5acda8d9b9a7..06d4420508c5 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -31,7 +31,7 @@ #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 +#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a77e92bd3f72..f8b9da53191e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2173,7 +2173,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) */ exec_control = vmx_exec_control(vmx); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; @@ -2566,7 +2566,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) return -EINVAL; if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) + nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) return -EINVAL; return 0; @@ -3183,7 +3183,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, u32 exit_qual; evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_VIRTUAL_NMI_PENDING); + (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); @@ -3407,7 +3407,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && - !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) && + !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; @@ -5527,7 +5527,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_INTERRUPT_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: - return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); + return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: @@ -6016,7 +6016,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high &= CPU_BASED_INTR_WINDOW_EXITING | - CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | diff --git 
a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0693dd0b5dbc..51d8b2043dd0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4361,7 +4361,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) return; } - exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } static void vmx_inject_irq(struct kvm_vcpu *vcpu) @@ -5182,7 +5182,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!enable_vnmi); - exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index c6e442d7a241..7eb38451c359 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -30,7 +30,7 @@ #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 -#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 +#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000 #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 -- cgit v1.2.3 From 5e3d394fdd9e6b49cd8b28d85adff100a5bddc66 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Fri, 6 Dec 2019 16:45:26 +0800 Subject: KVM: VMX: Fix the spelling of CPU_BASED_USE_TSC_OFFSETTING The misspellings were found by checkpatch.pl, so fix them. Signed-off-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/vmx/nested.c | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 6 +++--- tools/testing/selftests/kvm/include/x86_64/vmx.h | 2 +- tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 06d4420508c5..d716fe938fc0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -20,7 +20,7 @@ * Definitions of Primary Processor-Based VM-Execution Controls. */ #define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 #define CPU_BASED_MWAIT_EXITING 0x00000400 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f8b9da53191e..8c215da368b7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3230,7 +3230,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) @@ -3294,7 +3294,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * 26.7 "VM-entry failures during or after loading guest state". 
*/ vmentry_fail_vmexit_guest_mode: - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); @@ -4209,7 +4209,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; if (likely(!vmx->fail)) { @@ -6016,7 +6016,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; msrs->procbased_ctls_high &= CPU_BASED_INTR_WINDOW_EXITING | - CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 51d8b2043dd0..b5a0c2e05825 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1716,7 +1716,7 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) return vcpu->arch.tsc_offset - vmcs12->tsc_offset; return vcpu->arch.tsc_offset; @@ -1734,7 +1734,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) * to the newly set TSC to get L2's TSC. */ if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) g_tsc_offset = vmcs12->tsc_offset; trace_kvm_write_tsc_offset(vcpu->vcpu_id, @@ -2353,7 +2353,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_USE_TSC_OFFSETTING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h index 7eb38451c359..3d27069b9ed9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/vmx.h +++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h @@ -19,7 +19,7 @@ * Definitions of Primary Processor-Based VM-Execution Controls. 
*/ #define CPU_BASED_INTR_WINDOW_EXITING 0x00000004 -#define CPU_BASED_USE_TSC_OFFSETING 0x00000008 +#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 #define CPU_BASED_MWAIT_EXITING 0x00000400 diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c index 5590fd2bcf87..69e482a95c47 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c @@ -98,7 +98,7 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING; + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); -- cgit v1.2.3 From dd2d6042b7f4a5440705b4ffc6c4c2dba81a43b7 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 6 Dec 2019 15:46:35 -0800 Subject: kvm: nVMX: VMWRITE checks VMCS-link pointer before VMCS field According to the SDM, a VMWRITE in VMX non-root operation with an invalid VMCS-link pointer results in VMfailInvalid before the validity of the VMCS field in the secondary source operand is checked. For consistency, modify both handle_vmwrite and handle_vmread, even though there was no problem with the latter. Fixes: 6d894f498f5d1 ("KVM: nVMX: vmread/vmwrite: Use shadow vmcs12 if running L2") Signed-off-by: Jim Mattson Cc: Liran Alon Cc: Paolo Bonzini Cc: Vitaly Kuznetsov Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Reviewed-by: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 59 ++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 34 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8c215da368b7..a3ec92c316f6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4753,32 +4753,28 @@ static int handle_vmread(struct kvm_vcpu *vcpu) { unsigned long field; u64 field_value; + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); int len; gva_t gva = 0; - struct vmcs12 *vmcs12; + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); struct x86_exception e; short offset; if (!nested_vmx_check_permission(vcpu)) return 1; - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) + /* + * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * any VMREAD sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == -1ull || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); - if (!is_guest_mode(vcpu)) - vmcs12 = get_vmcs12(vcpu); - else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMREAD - * to shadowed-field sets the ALU flags for VMfailInvalid. 
- */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } - /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); @@ -4855,13 +4851,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; - struct vmcs12 *vmcs12; + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); short offset; if (!nested_vmx_check_permission(vcpu)) return 1; - if (vmx->nested.current_vmptr == -1ull) + /* + * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * any VMWRITE sets the ALU flags for VMfailInvalid. + */ + if (vmx->nested.current_vmptr == -1ull || + (is_guest_mode(vcpu) && + get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); if (vmx_instruction_info & (1u << 10)) @@ -4889,24 +4892,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - if (!is_guest_mode(vcpu)) { - vmcs12 = get_vmcs12(vcpu); - - /* - * Ensure vmcs12 is up-to-date before any VMWRITE that dirties - * vmcs12, else we may crush a field or consume a stale value. - */ - if (!is_shadow_field_rw(field)) - copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - } else { - /* - * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE - * to shadowed-field sets the ALU flags for VMfailInvalid. - */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) - return nested_vmx_failInvalid(vcpu); - vmcs12 = get_shadow_vmcs12(vcpu); - } + /* + * Ensure vmcs12 is up-to-date before any VMWRITE that dirties + * vmcs12, else we may crush a field or consume a stale value. + */ + if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) + copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); offset = vmcs_field_to_offset(field); if (offset < 0) -- cgit v1.2.3 From 693e02cc24090c379217138719d9d84e50036b24 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 6 Dec 2019 15:46:36 -0800 Subject: kvm: nVMX: VMWRITE checks unsupported field before read-only field According to the SDM, VMWRITE checks to see if the secondary source operand corresponds to an unsupported VMCS field before it checks to see if the secondary source operand corresponds to a VM-exit information field and the processor does not support writing to VM-exit information fields. Fixes: 49f705c5324aa ("KVM: nVMX: Implement VMREAD and VMWRITE") Signed-off-by: Jim Mattson Cc: Paolo Bonzini Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Reviewed-by: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a3ec92c316f6..e2fa5aefed29 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4883,6 +4883,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + offset = vmcs_field_to_offset(field); + if (offset < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + /* * If the vCPU supports "VMWRITE to any supported field in the * VMCS," then the "read-only" fields are actually read/write. 
@@ -4899,11 +4905,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - offset = vmcs_field_to_offset(field); - if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); - /* * Some Intel CPUs intentionally drop the reserved bits of the AR byte * fields on VMWRITE. Emulate this behavior to ensure consistent KVM -- cgit v1.2.3 From c90f4d03cce1814b4e08372359116710bbaccce3 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 6 Dec 2019 15:46:37 -0800 Subject: kvm: nVMX: Aesthetic cleanup of handle_vmread and handle_vmwrite Apply reverse fir tree declaration order, shorten some variable names to avoid line wrap, reformat a block comment, delete an extra blank line, and use BIT(10) instead of (1u << 10). Signed-off-by: Jim Mattson Cc: Paolo Bonzini Cc: Sean Christopherson Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Reviewed-by: Jon Cargille Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 70 +++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e2fa5aefed29..7b01ef1d87e6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4751,17 +4751,17 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) static int handle_vmread(struct kvm_vcpu *vcpu) { - unsigned long field; - u64 field_value; - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - int len; - gva_t gva = 0; struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; + unsigned long field; + u64 value; + gva_t gva = 0; short offset; + int len; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4776,7 +4776,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_failInvalid(vcpu); /* Decode instruction info and find the field to read */ - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); offset = vmcs_field_to_offset(field); if (offset < 0) @@ -4786,24 +4786,23 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - /* Read the field, zero-extended to a u64 field_value */ - field_value = vmcs12_read_any(vmcs12, field, offset); + /* Read the field, zero-extended to a u64 value */ + value = vmcs12_read_any(vmcs12, field, offset); /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending * on the guest's mode (32 or 64 bit), not on the given field's length. */ - if (vmx_instruction_info & (1u << 10)) { - kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), - field_value); + if (instr_info & BIT(10)) { + kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value); } else { len = is_64_bit_mode(vcpu) ? 
8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, len, &gva)) + instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) + if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) kvm_inject_page_fault(vcpu, &e); } @@ -4836,24 +4835,25 @@ static bool is_shadow_field_ro(unsigned long field) static int handle_vmwrite(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) + : get_vmcs12(vcpu); + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct x86_exception e; unsigned long field; - int len; + short offset; gva_t gva; - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + int len; - /* The value to write might be 32 or 64 bits, depending on L1's long + /* + * The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the appropriate number of + * bit (value), and then copies only the appropriate number of * bits into the vmcs12 field. */ - u64 field_value = 0; - struct x86_exception e; - struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) - : get_vmcs12(vcpu); - short offset; + u64 value = 0; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -4867,22 +4867,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) return nested_vmx_failInvalid(vcpu); - if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_readl(vcpu, - (((vmx_instruction_info) >> 3) & 0xf)); + if (instr_info & BIT(10)) + value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf)); else { len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, false, len, &gva)) + instr_info, false, len, &gva)) return 1; - if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { + if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } } - - field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); offset = vmcs_field_to_offset(field); if (offset < 0) @@ -4914,9 +4912,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * the stripped down value, L2 sees the full value as stored by KVM). 
*/ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) - field_value &= 0x1f0ff; + value &= 0x1f0ff; - vmcs12_write_any(vmcs12, field, offset, field_value); + vmcs12_write_any(vmcs12, field, offset, value); /* * Do not track vmcs12 dirty-state if in guest-mode as we actually @@ -4933,7 +4931,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) preempt_disable(); vmcs_load(vmx->vmcs01.shadow_vmcs); - __vmcs_writel(field, field_value); + __vmcs_writel(field, value); vmcs_clear(vmx->vmcs01.shadow_vmcs); vmcs_load(vmx->loaded_vmcs->vmcs); -- cgit v1.2.3 From f958bd2314d117f8c29f4821401bc1925bc2e5ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 9 Dec 2019 12:19:31 -0800 Subject: KVM: x86: Fix potential put_fpu() w/o load_fpu() on MPX platform Unlike most state managed by XSAVE, MPX is initialized to zero on INIT. Because INITs are usually recognized in the context of a VCPU_RUN call, kvm_vcpu_reset() puts the guest's FPU so that the FPU state is resident in memory, zeros the MPX state, and reloads FPU state to hardware. But, in the unlikely event that an INIT is recognized during kvm_arch_vcpu_ioctl_get_mpstate() via kvm_apic_accept_events(), kvm_vcpu_reset() will call kvm_put_guest_fpu() without a preceding kvm_load_guest_fpu() and corrupt the guest's FPU state (and possibly userspace's FPU state as well). Given that MPX is being removed from the kernel[*], fix the bug with the simple-but-ugly approach of loading the guest's FPU during KVM_GET_MP_STATE. [*] See commit f240652b6032b ("x86/mpx: Remove MPX APIs"). Fixes: f775b13eedee2 ("x86,kvm: move qemu/guest FPU switching out to vcpu_run") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3051324f72d3..0af5cb637bea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8714,6 +8714,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu_load(vcpu); + if (kvm_mpx_supported()) + kvm_load_guest_fpu(vcpu); kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && @@ -8722,6 +8724,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; + if (kvm_mpx_supported()) + kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); return 0; } -- cgit v1.2.3 From 95145c25a78cc0a9d3cbc75708abde432310c5a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 9 Dec 2019 12:05:17 -0800 Subject: KVM: x86: Add a WARN on TIF_NEED_FPU_LOAD in kvm_load_guest_fpu() WARN once in kvm_load_guest_fpu() if TIF_NEED_FPU_LOAD is observed, as that would mean that KVM is corrupting userspace's FPU by saving unknown register state into arch.user_fpu. Add a comment to explain why KVM WARNs on TIF_NEED_FPU_LOAD instead of implementing logic similar to fpu__copy(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0af5cb637bea..25aac4c81b12 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8476,6 +8476,13 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); + /* + * Reloading userspace's FPU is handled by kvm_arch_vcpu_load(), both + * for direct calls from userspace (via vcpu_load()) and if this task + * is preempted (via kvm_sched_in()) between vcpu_load() and now. 
+ */ + WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); + copy_fpregs_to_fpstate(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, -- cgit v1.2.3 From 736c291c9f36b07f8889c61764c28edce20e715d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:14 -0800 Subject: KVM: x86: Use gpa_t for cr2/gpa to fix TDP support on 32-bit KVM Convert a plethora of parameters and variables in the MMU and page fault flows from type gva_t to gpa_t to properly handle TDP on 32-bit KVM. Thanks to PSE and PAE paging, 32-bit kernels can access 64-bit physical addresses. When TDP is enabled, the fault address is a guest physical address and thus can be a 64-bit value, even when both KVM and its guest are using 32-bit virtual addressing, e.g. VMX's VMCS.GUEST_PHYSICAL is a 64-bit field, not a natural width field. Using a gva_t for the fault address means KVM will incorrectly drop the upper 32-bits of the GPA. Ditto for gva_to_gpa() when it is used to translate L2 GPAs to L1 GPAs. Opportunistically rename variables and parameters to better reflect the dual address modes, e.g. use "cr2_or_gpa" for fault addresses and plain "addr" instead of "vaddr" when the address may be either a GVA or an L2 GPA. Similarly, use "gpa" in the nonpaging_page_fault() flows to avoid a confusing "gpa_t gva" declaration; this also sets the stage for a future patch to combining nonpaging_page_fault() and tdp_page_fault() with minimal churn. Sprinkle in a few comments to document flows where an address is known to be a GVA and thus can be safely truncated to a 32-bit value. Add WARNs in kvm_handle_page_fault() and FNAME(gva_to_gpa_nested)() to help document such cases and detect bugs. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++--- arch/x86/kvm/mmu/mmu.c | 69 ++++++++++++++++++++++++----------------- arch/x86/kvm/mmu/paging_tmpl.h | 25 +++++++++------ arch/x86/kvm/mmutrace.h | 12 +++---- arch/x86/kvm/x86.c | 40 ++++++++++++------------ arch/x86/kvm/x86.h | 2 +- include/linux/kvm_host.h | 6 ++-- virt/kvm/async_pf.c | 10 +++--- 8 files changed, 94 insertions(+), 78 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2893eae5df9f..159a28512e4c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -378,12 +378,12 @@ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); - int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, + int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, bool prefault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, - struct x86_exception *exception); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa, + u32 access, struct x86_exception *exception); gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -1473,7 +1473,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn,
int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c19f3ccaace3..2cb199817837 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3532,7 +3532,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) * - true: let the vcpu to access on the same address again. * - false: let the real page fault path to fix it. */ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, +static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, u32 error_code) { struct kvm_shadow_walk_iterator iterator; @@ -3552,7 +3552,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) if (!is_shadow_present_pte(spte) || iterator.level < level) break; @@ -3630,7 +3630,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, } while (true); - trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, spte, fault_handled); walk_shadow_page_lockless_end(vcpu); @@ -3638,10 +3638,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, + bool *writable); static int make_mmu_pages_available(struct kvm_vcpu *vcpu); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, +static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn, bool prefault) { int r; @@ -3667,16 +3668,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); } - if (fast_page_fault(vcpu, v, level, error_code)) + if (fast_page_fault(vcpu, gpa, level, error_code)) return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) return RET_PF_RETRY; - if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -3687,7 +3688,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -3985,7 +3986,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { if (exception) @@ -3993,7 +3994,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, return vaddr; } -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -4153,13 +4154,14 @@ static void 
shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { - gfn_t gfn = gva >> PAGE_SHIFT; + gfn_t gfn = gpa >> PAGE_SHIFT; int r; - pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); + /* Note, paging is disabled, ergo gva == gpa. */ + pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4171,11 +4173,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - return nonpaging_map(vcpu, gva & PAGE_MASK, + return nonpaging_map(vcpu, gpa & PAGE_MASK, error_code, gfn, prefault); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; @@ -4184,11 +4187,13 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu->direct_map; arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, cr2_or_gpa, + kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, + bool *writable) { struct kvm_memory_slot *slot; bool async; @@ -4208,12 +4213,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; /* *pfn has correct page already */ if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(gva, gfn); + trace_kvm_try_async_get_page(cr2_or_gpa, gfn); if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(gva, gfn); + trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return true; - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) + } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) return true; } @@ -4226,6 +4231,12 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, { int r = 1; +#ifndef CONFIG_X86_64 + /* A 64-bit CR2 should be impossible on 32-bit KVM. */ + if (WARN_ON_ONCE(fault_address >> 32)) + return -EFAULT; +#endif + vcpu->arch.l1tf_flush_l1d = true; switch (vcpu->arch.apf.host_apf_reason) { default: @@ -4263,7 +4274,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); } -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, +static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { kvm_pfn_t pfn; @@ -5520,7 +5531,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) return 0; } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = 0; @@ -5529,18 +5540,18 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, /* With shadow page tables, fault_address contains a GVA or nGPA. 
*/ if (vcpu->arch.mmu->direct_map) { vcpu->arch.gpa_available = true; - vcpu->arch.gpa_val = cr2; + vcpu->arch.gpa_val = cr2_or_gpa; } r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, cr2, direct); + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { - r = vcpu->arch.mmu->page_fault(vcpu, cr2, + r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); WARN_ON(r == RET_PF_INVALID); @@ -5560,7 +5571,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, */ if (vcpu->arch.mmu->direct_map && (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); return 1; } @@ -5575,7 +5586,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, * explicitly shadowing L1's page tables, i.e. unprotecting something * for L1 isn't going to magically fix whatever issue cause L2 to fail. */ - if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) + if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) emulation_type = EMULTYPE_ALLOW_RETRY; emulate: /* @@ -5590,7 +5601,7 @@ emulate: return 1; } - return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, + return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 97b21e7fd013..c1d7b866a03f 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -291,11 +291,11 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) } /* - * Fetch a guest pte for a guest virtual address + * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t addr, u32 access) + gpa_t addr, u32 access) { int ret; pt_element_t pte; @@ -496,7 +496,7 @@ error: } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, u32 access) + struct kvm_vcpu *vcpu, gpa_t addr, u32 access) { return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); @@ -611,7 +611,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, int write_fault, int hlevel, kvm_pfn_t pfn, bool map_writable, bool prefault, @@ -765,7 +765,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool prefault) { int write_fault = error_code & PFERR_WRITE_MASK; @@ -945,18 +945,19 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, +/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. 
*/ +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, struct x86_exception *exception) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr)(&walker, vcpu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; + gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; @@ -964,7 +965,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, } #if PTTYPE != PTTYPE_EPT -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, +/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */ +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -972,6 +974,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, gpa_t gpa = UNMAPPED_GVA; int r; +#ifndef CONFIG_X86_64 + /* A 64-bit GVA should be impossible on 32-bit KVM. */ + WARN_ON_ONCE(vaddr >> 32); +#endif + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); if (r) { diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 7ca8831c7d1a..3c6522b84ff1 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -249,13 +249,13 @@ TRACE_EVENT( TRACE_EVENT( fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, u64 *sptep, u64 old_spte, bool retry), - TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry), TP_STRUCT__entry( __field(int, vcpu_id) - __field(gva_t, gva) + __field(gpa_t, cr2_or_gpa) __field(u32, error_code) __field(u64 *, sptep) __field(u64, old_spte) @@ -265,7 +265,7 @@ TRACE_EVENT( TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->gva = gva; + __entry->cr2_or_gpa = cr2_or_gpa; __entry->error_code = error_code; __entry->sptep = sptep; __entry->old_spte = old_spte; @@ -273,9 +273,9 @@ TRACE_EVENT( __entry->retry = retry; ), - TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx" " new %llx spurious %d fixed %d", __entry->vcpu_id, - __entry->gva, __print_flags(__entry->error_code, "|", + __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|", kvm_mmu_trace_pferr_flags), __entry->sptep, __entry->old_spte, __entry->new_spte, __spte_satisfied(old_spte), __spte_satisfied(new_spte) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 25aac4c81b12..93bbbce67a03 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6379,11 +6379,11 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool write_fault_to_shadow_pgtable, int emulation_type) { - gpa_t gpa = cr2; + gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) @@ -6397,7 +6397,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, * Write permission should be allowed since only * write access need to be emulated. 
*/ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); /* * If the mapping is invalid in guest, let cpu retry @@ -6454,10 +6454,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - unsigned long cr2, int emulation_type) + gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; @@ -6486,14 +6486,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (x86_page_table_writing_insn(ctxt)) return false; - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) + if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) return false; vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2; + vcpu->arch.last_retry_addr = cr2_or_gpa; if (!vcpu->arch.mmu->direct_map) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6639,11 +6639,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } -int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - int emulation_type, - void *insn, - int insn_len) +int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + int emulation_type, void *insn, int insn_len) { int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -6689,8 +6686,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, - emulation_type)) + if (reexecute_instruction(vcpu, cr2_or_gpa, + write_fault_to_spt, + emulation_type)) return 1; if (ctxt->have_exception) { /* @@ -6724,7 +6722,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return 1; } - if (retry_instruction(ctxt, cr2, emulation_type)) + if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) return 1; /* this is needed for vmware backdoor interface to work since it @@ -6736,7 +6734,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, restart: /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2; + ctxt->exception.address = cr2_or_gpa; r = x86_emulate_insn(ctxt); @@ -6744,7 +6742,7 @@ restart: return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, emulation_type)) return 1; @@ -10025,7 +10023,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) return; - vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true); + vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) @@ -10138,7 +10136,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_not_present(work->arch.token, work->gva); + trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (kvm_can_deliver_async_pf(vcpu) && @@ -10173,7 +10171,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, work->arch.token = ~0; /* broadcast wakeup */ else 
kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - trace_kvm_async_pf_ready(work->arch.token, work->gva); + trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && !apf_get_user(vcpu, &val)) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 29391af8871d..cab5e71f0f0f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -289,7 +289,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); -int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, +int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0d632a75fce9..528ab7a814ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -204,7 +204,7 @@ struct kvm_async_pf { struct list_head queue; struct kvm_vcpu *vcpu; struct mm_struct *mm; - gva_t gva; + gpa_t cr2_or_gpa; unsigned long addr; struct kvm_arch_async_pf arch; bool wakeup_all; @@ -212,8 +212,8 @@ struct kvm_async_pf { void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, - struct kvm_arch_async_pf *arch); +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 35305d6e68cc..d8ef708a2ef6 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -64,7 +64,7 @@ static void async_pf_execute(struct work_struct *work) struct mm_struct *mm = apf->mm; struct kvm_vcpu *vcpu = apf->vcpu; unsigned long addr = apf->addr; - gva_t gva = apf->gva; + gpa_t cr2_or_gpa = apf->cr2_or_gpa; int locked = 1; might_sleep(); @@ -92,7 +92,7 @@ static void async_pf_execute(struct work_struct *work) * this point */ - trace_kvm_async_pf_completed(addr, gva); + trace_kvm_async_pf_completed(addr, cr2_or_gpa); if (swq_has_sleeper(&vcpu->wq)) swake_up_one(&vcpu->wq); @@ -165,8 +165,8 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, - struct kvm_arch_async_pf *arch) +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; @@ -185,7 +185,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, work->wakeup_all = false; work->vcpu = vcpu; - work->gva = gva; + work->cr2_or_gpa = cr2_or_gpa; work->addr = hva; work->arch = *arch; work->mm = current->mm; -- cgit v1.2.3 From ba7888dde6afc32885a0960f11b898ff97d4a060 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:15 -0800 Subject: KVM: x86/mmu: Move definition of make_mmu_pages_available() up Move make_mmu_pages_available() above its first user to put it closer to related code and eliminate a forward declaration. No functional change intended. 
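The helper being moved has a small, self-contained contract: zapping kicks in only once free shadow pages drop below KVM_MIN_FREE_MMU_PAGES, but then continues evicting the oldest pages until KVM_REFILL_PAGES are free, so a vCPU doesn't zap a single page on every fault. A minimal standalone model of that hysteresis (struct page_cache and evict_oldest() are hypothetical stubs, with evict_oldest() assumed to free one entry and bump c->available; the real function is visible unchanged in the diff that follows):

	/* Model of make_mmu_pages_available(): evict oldest entries until a
	 * refill watermark is met; fail only if nothing is left to evict. */
	static int pages_available_model(struct page_cache *c,
					 int min_free, int refill)
	{
		if (c->available >= min_free)
			return 0;

		while (c->available < refill) {
			if (!evict_oldest(c))	/* nothing left to zap */
				break;
			c->recycled++;		/* mirrors stat.mmu_recycled */
		}

		return c->available ? 0 : -ENOSPC;
	}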
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2cb199817837..638618c384d0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2903,6 +2903,26 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } +static int make_mmu_pages_available(struct kvm_vcpu *vcpu) +{ + LIST_HEAD(invalid_list); + + if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + return 0; + + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { + if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) + break; + + ++vcpu->kvm->stat.mmu_recycled; + } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + if (!kvm_mmu_available_pages(vcpu->kvm)) + return -ENOSPC; + return 0; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock @@ -3640,7 +3660,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable); -static int make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn, bool prefault) @@ -5511,26 +5530,6 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); -static int make_mmu_pages_available(struct kvm_vcpu *vcpu) -{ - LIST_HEAD(invalid_list); - - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) - return 0; - - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) - break; - - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - - if (!kvm_mmu_available_pages(vcpu->kvm)) - return -ENOSPC; - return 0; -} - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { -- cgit v1.2.3 From 367fd790b17dbe2847c29099cf9902ded207901c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:16 -0800 Subject: KVM: x86/mmu: Fold nonpaging_map() into nonpaging_page_fault() Fold nonpaging_map() into its sole caller, nonpaging_page_fault(), in preparation for combining the bulk of nonpaging_page_fault() and tdp_page_fault() into a common helper. No functional change intended. 
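The fold is large but mechanical; the consolidated handler is easier to review as a condensed control-flow sketch (every call below appears in the diff that follows; locals, the MMU_WARN_ON, and the unlock/error paths are elided):

	/* Condensed flow of nonpaging_page_fault() after the fold: */
	if (page_fault_handle_page_track(vcpu, error_code, gfn))
		return RET_PF_EMULATE;
	r = mmu_topup_memory_caches(vcpu);	/* bail out if this fails */
	level = mapping_level(vcpu, gfn, &force_pt_level);
	if (fast_page_fault(vcpu, gpa, level, error_code))
		return RET_PF_RETRY;		/* fixed locklessly */
	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
		return RET_PF_RETRY;		/* fault handed off async */
	spin_lock(&vcpu->kvm->mmu_lock);	/* retry + thp adjust inside */
	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
			 prefault, false);
	spin_unlock(&vcpu->kvm->mmu_lock);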
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 106 +++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 57 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 638618c384d0..a1d1fc21aa8d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3661,60 +3661,6 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - gfn_t gfn, bool prefault) -{ - int r; - int level; - bool force_pt_level; - kvm_pfn_t pfn; - unsigned long mmu_seq; - bool map_writable, write = error_code & PFERR_WRITE_MASK; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); - - force_pt_level = lpage_disallowed; - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - - if (fast_page_fault(vcpu, gpa, level, error_code)) - return RET_PF_RETRY; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return RET_PF_RETRY; - - if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) - return r; - - r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) - goto out_unlock; - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, - prefault, false); -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return r; -} - static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { @@ -4176,12 +4122,21 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { - gfn_t gfn = gpa >> PAGE_SHIFT; int r; + int level; + kvm_pfn_t pfn; + unsigned long mmu_seq; + gfn_t gfn = gpa >> PAGE_SHIFT; + bool write = error_code & PFERR_WRITE_MASK; + bool force_pt_level, map_writable; + bool exec = error_code & PFERR_FETCH_MASK; + bool lpage_disallowed = exec && is_nx_huge_page_enabled(); /* Note, paging is disabled, ergo gva == gpa. */ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + gpa &= PAGE_MASK; + if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4191,9 +4146,46 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); + force_pt_level = lpage_disallowed; + level = mapping_level(vcpu, gfn, &force_pt_level); + if (likely(!force_pt_level)) { + /* + * This path builds a PAE pagetable - so we can map + * 2mb pages at maximum. Therefore check if the level + * is larger than that. 
+ */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + + if (fast_page_fault(vcpu, gpa, level, error_code)) + return RET_PF_RETRY; + + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + return RET_PF_RETRY; + + if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) + return r; - return nonpaging_map(vcpu, gpa & PAGE_MASK, - error_code, gfn, prefault); + r = RET_PF_RETRY; + spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) + goto out_unlock; + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; + if (likely(!force_pt_level)) + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, false); +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return r; } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, -- cgit v1.2.3 From 9f1a8526fbe3e82afae565fd008f2ca2035662e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:17 -0800 Subject: KVM: x86/mmu: Move nonpaging_page_fault() below try_async_pf() Move nonpaging_page_fault() below try_async_pf() to eliminate the forward declaration of try_async_pf() and to prepare for combining the bulk of nonpaging_page_fault() and tdp_page_fault() into a common helper. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 102 ++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 53 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a1d1fc21aa8d..b3633067143e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3657,10 +3657,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, return fault_handled; } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable); - static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { @@ -4119,6 +4115,55 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) +{ + struct kvm_arch_async_pf arch; + + arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; + arch.gfn = gfn; + arch.direct_map = vcpu->arch.mmu->direct_map; + arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); + + return kvm_setup_async_pf(vcpu, cr2_or_gpa, + kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); +} + +static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, + gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, + bool *writable) +{ + struct kvm_memory_slot *slot; + bool async; + + /* + * Don't expose private memslots to L2. 
+ */ + if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { + *pfn = KVM_PFN_NOSLOT; + return false; + } + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + async = false; + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); + if (!async) + return false; /* *pfn has correct page already */ + + if (!prefault && kvm_can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(cr2_or_gpa, gfn); + if (kvm_find_async_pf_gfn(vcpu, gfn)) { + trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return true; + } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) + return true; + } + + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); + return false; +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { @@ -4188,55 +4233,6 @@ out_unlock: return r; } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) -{ - struct kvm_arch_async_pf arch; - - arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; - arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu->direct_map; - arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); - - return kvm_setup_async_pf(vcpu, cr2_or_gpa, - kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write, - bool *writable) -{ - struct kvm_memory_slot *slot; - bool async; - - /* - * Don't expose private memslots to L2. - */ - if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) { - *pfn = KVM_PFN_NOSLOT; - return false; - } - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); - if (!async) - return false; /* *pfn has correct page already */ - - if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(cr2_or_gpa, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; - } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) - return true; - } - - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); - return false; -} - int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { -- cgit v1.2.3 From cb9b88c669396e3e5f957fe909ff901b51321013 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:18 -0800 Subject: KVM: x86/mmu: Refactor handling of cache consistency with TDP Pre-calculate the max level for a TDP page with respect to MTRR cache consistency in preparation of replacing force_pt_level with max_level, and eventually combining the bulk of nonpaging_page_fault() and tdp_page_fault() into a common helper. No functional change intended. 
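The pre-calculation replaces the old check_hugepage_cache_consistency() helper with a descending walk that stops at the largest page size whose entire aligned range the guest MTRRs map with a single memory type. A standalone model of the walk (mtrr_range_is_uniform() is a stub standing in for kvm_mtrr_check_gfn_range_consistency()):

	int max_level;

	for (max_level = PT_MAX_HUGEPAGE_LEVEL;
	     max_level > PT_PAGE_TABLE_LEVEL;
	     max_level--) {
		int page_num = KVM_PAGES_PER_HPAGE(max_level);
		gfn_t base = gfn & ~(page_num - 1);

		if (mtrr_range_is_uniform(base, page_num))	/* stub */
			break;
	}
	/* max_level is PT_PAGE_TABLE_LEVEL if no larger range is uniform. */

For example, a gfn whose surrounding 1GB range mixes WB and UC but whose aligned 2MB range is uniformly WB yields PT_DIRECTORY_LEVEL.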
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b3633067143e..defe94ecd0a4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4271,16 +4271,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -static bool -check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) -{ - int page_num = KVM_PAGES_PER_HPAGE(level); - - gfn &= ~(page_num - 1); - - return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); -} - static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { @@ -4294,6 +4284,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool map_writable; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && is_nx_huge_page_enabled(); + int max_level; MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); @@ -4304,14 +4295,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (r) return r; - force_pt_level = - lpage_disallowed || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); + for (max_level = PT_MAX_HUGEPAGE_LEVEL; + max_level > PT_PAGE_TABLE_LEVEL; + max_level--) { + int page_num = KVM_PAGES_PER_HPAGE(max_level); + gfn_t base = gfn & ~(page_num - 1); + + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; + } + + force_pt_level = lpage_disallowed || max_level == PT_PAGE_TABLE_LEVEL; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - if (level > PT_DIRECTORY_LEVEL && - !check_hugepage_cache_consistency(vcpu, gfn, level)) - level = PT_DIRECTORY_LEVEL; + if (level > max_level) + level = max_level; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); } -- cgit v1.2.3 From f0f37e229c0517fa0d8bda73a2aeee28260370a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:19 -0800 Subject: KVM: x86/mmu: Refactor the per-slot level calculation in mapping_level() Invert the loop which adjusts the allowed page level based on what's compatible with the associated memslot to use a largest-to-smallest page size walk. This paves the way for passing around a "max level" variable instead of having redundant checks and/or multiple booleans. No functional change intended. 
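To see why the inversion preserves the result, take a gfn for which 1GB mappings are disallowed in the slot but 2MB mappings are allowed: the old ascending loop runs until level == 1GB first trips the disallow check and returns level - 1, i.e. the 2MB level, while the new descending loop breaks as soon as max_level == 2MB passes the check and returns max_level directly. A sketch of the descending walk (lpage_is_disallowed() is a stub for __mmu_gfn_lpage_is_disallowed()):

	for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
		if (!lpage_is_disallowed(gfn, max_level, slot))	/* stub */
			break;	/* largest still-allowed page size */
	}
	return max_level;	/* PT_PAGE_TABLE_LEVEL if all are disallowed */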
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index defe94ecd0a4..8db2bb050809 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1330,7 +1330,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { - int host_level, level, max_level; + int host_level, max_level; struct kvm_memory_slot *slot; if (unlikely(*force_pt_level)) @@ -1347,12 +1347,12 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, return host_level; max_level = min(kvm_x86_ops->get_lpage_level(), host_level); - - for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) + for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot)) break; + } - return level - 1; + return max_level; } /* -- cgit v1.2.3 From 39ca1ecb784b29965fd780bed1e8a3792a086a29 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:20 -0800 Subject: KVM: x86/mmu: Refactor handling of forced 4k pages in page faults Refactor the page fault handlers and mapping_level() to track the max allowed page level instead of only tracking if a 4k page is mandatory due to one restriction or another. This paves the way for cleanly consolidating tdp_page_fault() and nonpaging_page_fault(), and for eliminating a redundant check on mmu_gfn_lpage_is_disallowed(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 45 ++++++++++++++++++------------------------ arch/x86/kvm/mmu/paging_tmpl.h | 16 +++++++++------ 2 files changed, 29 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8db2bb050809..daf41806243f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1328,18 +1328,19 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, - bool *force_pt_level) + int *max_levelp) { - int host_level, max_level; + int host_level, max_level = *max_levelp; struct kvm_memory_slot *slot; - if (unlikely(*force_pt_level)) + if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); - *force_pt_level = !memslot_valid_for_gpte(slot, true); - if (unlikely(*force_pt_level)) + if (!memslot_valid_for_gpte(slot, true)) { + *max_levelp = PT_PAGE_TABLE_LEVEL; return PT_PAGE_TABLE_LEVEL; + } host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -4173,9 +4174,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long mmu_seq; gfn_t gfn = gpa >> PAGE_SHIFT; bool write = error_code & PFERR_WRITE_MASK; - bool force_pt_level, map_writable; + bool map_writable; bool exec = error_code & PFERR_FETCH_MASK; bool lpage_disallowed = exec && is_nx_huge_page_enabled(); + int max_level; /* Note, paging is disabled, ergo gva == gpa. 
*/ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); @@ -4191,19 +4193,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - force_pt_level = lpage_disallowed; - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; + /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ + max_level = lpage_disallowed ? PT_PAGE_TABLE_LEVEL : PT_DIRECTORY_LEVEL; + level = mapping_level(vcpu, gfn, &max_level); + if (level > PT_PAGE_TABLE_LEVEL) gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } if (fast_page_fault(vcpu, gpa, level, error_code)) return RET_PF_RETRY; @@ -4223,7 +4218,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (likely(!force_pt_level)) + if (likely(max_level > PT_PAGE_TABLE_LEVEL)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, false); @@ -4277,7 +4272,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, kvm_pfn_t pfn; int r; int level; - bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -4305,13 +4299,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; } - force_pt_level = lpage_disallowed || max_level == PT_PAGE_TABLE_LEVEL; - level = mapping_level(vcpu, gfn, &force_pt_level); - if (likely(!force_pt_level)) { - if (level > max_level) - level = max_level; + if (lpage_disallowed) + max_level = PT_PAGE_TABLE_LEVEL; + + level = mapping_level(vcpu, gfn, &max_level); + if (level > PT_PAGE_TABLE_LEVEL) gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } if (fast_page_fault(vcpu, gpa, level, error_code)) return RET_PF_RETRY; @@ -4331,7 +4324,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (likely(!force_pt_level)) + if (likely(max_level > PT_PAGE_TABLE_LEVEL)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, lpage_disallowed); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c1d7b866a03f..1938a6e4e631 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -778,7 +778,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, bool map_writable, is_self_change_mapping; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && is_nx_huge_page_enabled(); - bool force_pt_level = lpage_disallowed; + int max_level; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -818,14 +818,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + max_level = lpage_disallowed ? 
PT_PAGE_TABLE_LEVEL : + PT_MAX_HUGEPAGE_LEVEL; + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - level = mapping_level(vcpu, walker.gfn, &force_pt_level); - if (likely(!force_pt_level)) { + level = mapping_level(vcpu, walker.gfn, &max_level); + if (likely(max_level > PT_DIRECTORY_LEVEL)) { level = min(walker.level, level); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } - } else - force_pt_level = true; + } else { + max_level = PT_PAGE_TABLE_LEVEL; + } mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -865,7 +869,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (!force_pt_level) + if (max_level > PT_PAGE_TABLE_LEVEL) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, pfn, map_writable, prefault, lpage_disallowed); -- cgit v1.2.3 From cbe1e6f035523b5fd29e44e18b82081b33d1f3f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:21 -0800 Subject: KVM: x86/mmu: Incorporate guest's page level into max level for shadow MMU Restrict the max level for a shadow page based on the guest's level instead of capping the level after the fact for host-mapped huge pages, e.g. hugetlbfs pages. Explicitly capping the max level using the guest mapping level also eliminates FNAME(page_fault)'s subtle dependency on THP only supporting 2mb pages. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1938a6e4e631..7d57ec576df0 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -773,7 +773,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; - int level = PT_PAGE_TABLE_LEVEL; + int level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && @@ -818,18 +818,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - max_level = lpage_disallowed ? 
PT_PAGE_TABLE_LEVEL : - PT_MAX_HUGEPAGE_LEVEL; - - if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - level = mapping_level(vcpu, walker.gfn, &max_level); - if (likely(max_level > PT_DIRECTORY_LEVEL)) { - level = min(walker.level, level); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - } else { + if (lpage_disallowed || is_self_change_mapping) max_level = PT_PAGE_TABLE_LEVEL; - } + else + max_level = walker.level; + + level = mapping_level(vcpu, walker.gfn, &max_level); + if (level > PT_PAGE_TABLE_LEVEL) + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); -- cgit v1.2.3 From 2f57b7051fe8fa680b7c38c7e98094fa3ba3ba8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:22 -0800 Subject: KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level Persist the max page level calculated via gfn_lpage_is_disallowed() to the max level "returned" by mapping_level() so that its naturally taken into account by the max level check that conditions calling transparent_hugepage_adjust(). Drop the gfn_lpage_is_disallowed() check in thp_adjust() as it's now handled by mapping_level() and its callers. Add a comment to document the behavior of host_mapping_level() and its interaction with max level and transparent huge pages. Note, transferring the gfn_lpage_is_disallowed() from thp_adjust() to mapping_level() superficially affects how changes to a memslot's disallow_lpage count will be handled due to thp_adjust() being run while holding mmu_lock. In the more common case where a different vCPU increments the count via account_shadowed(), gfn_lpage_is_disallowed() is rechecked by set_spte() to ensure a writable large page isn't created. In the less common case where the count is decremented to zero due to all shadow pages in the memslot being zapped, THP behavior now matches hugetlbfs behavior in the sense that a small page will be created when a large page could be used if the count reaches zero in the miniscule window between mapping_level() and acquiring mmu_lock. Lastly, the new THP behavior also follows hugetlbfs behavior in the absurdly unlikely scenario of a memslot being moved such that the memslot's compatibility with respect to large pages changes, but without changing the validity of the gpf->pfn walk. I.e. if a memslot is moved between mapping_level() and snapshotting mmu_seq, it's theoretically possible to consume a stale disallow_lpage count. But, since KVM zaps all shadow pages when moving a memslot and forces all vCPUs to reload a new MMU, the inserted spte will always be thrown away prior to completing the memslot move, i.e. whether or not the spte accurately reflects disallow_lpage is irrelevant. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index daf41806243f..7cfacfe8548e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1330,7 +1330,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int *max_levelp) { - int host_level, max_level = *max_levelp; + int max_level = *max_levelp; struct kvm_memory_slot *slot; if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) @@ -1342,18 +1342,27 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, return PT_PAGE_TABLE_LEVEL; } - host_level = host_mapping_level(vcpu->kvm, large_gfn); - - if (host_level == PT_PAGE_TABLE_LEVEL) - return host_level; - - max_level = min(kvm_x86_ops->get_lpage_level(), host_level); + max_level = min(max_level, kvm_x86_ops->get_lpage_level()); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot)) break; } - return max_level; + *max_levelp = max_level; + + if (max_level == PT_PAGE_TABLE_LEVEL) + return PT_PAGE_TABLE_LEVEL; + + /* + * Note, host_mapping_level() does *not* handle transparent huge pages. + * As suggested by "mapping", it reflects the page size established by + * the associated vma, if there is one, i.e. host_mapping_level() will + * return a huge page level if and only if a vma exists and the backing + * implementation for the vma uses huge pages, e.g. hugetlbfs and dax. + * So, do not propagate host_mapping_level() to max_level as KVM can + * still promote the guest mapping to a huge page in the THP case. + */ + return host_mapping_level(vcpu->kvm, large_gfn); } /* @@ -3424,8 +3433,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn)) && - !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { + PageTransCompoundMap(pfn_to_page(pfn))) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the -- cgit v1.2.3 From 2cb70fd441b6b40ffd9ee1782a972f149ba72158 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:23 -0800 Subject: KVM: x86/mmu: Rename lpage_disallowed to account_disallowed_nx_lpage Rename __direct_map()'s param that controls whether or not a disallowed NX large page should be accounted to match what it actually does. The nonpaging_page_fault() case unconditionally passes %false for the param even though it locally sets lpage_disallowed. No functional change intended. 
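The asymmetry motivating the rename is easiest to see by lining up the two __direct_map() call sites from the earlier patches in this series: the nonpaging path hardcodes false even though it computes lpage_disallowed locally, whereas the tdp path passes its flag through:

	/* nonpaging_page_fault(): never accounts an NX huge page */
	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
			 prefault, false);

	/* tdp_page_fault(): accounts when an NX huge page was disallowed */
	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
			 prefault, lpage_disallowed);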
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7cfacfe8548e..535598578647 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3353,7 +3353,7 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault, bool lpage_disallowed) + bool prefault, bool account_disallowed_nx_lpage) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3382,7 +3382,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (lpage_disallowed) + if (account_disallowed_nx_lpage) account_huge_nx_page(vcpu->kvm, sp); } } -- cgit v1.2.3 From 0f90e1c10dca5149529b4af6d94d7804ac2ef37b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:24 -0800 Subject: KVM: x86/mmu: Consolidate tdp_page_fault() and nonpaging_page_fault() Consolidate the direct MMU page fault handlers into a common helper, direct_page_fault(). Except for unique max level conditions, the tdp and nonpaging fault handlers are functionally identical. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 98 ++++++++++++++------------------------------------ 1 file changed, 27 insertions(+), 71 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 535598578647..313f77a34262 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4173,24 +4173,20 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, - u32 error_code, bool prefault) +static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault, int max_level, bool is_tdp) { - int r; - int level; - kvm_pfn_t pfn; - unsigned long mmu_seq; - gfn_t gfn = gpa >> PAGE_SHIFT; bool write = error_code & PFERR_WRITE_MASK; - bool map_writable; bool exec = error_code & PFERR_FETCH_MASK; bool lpage_disallowed = exec && is_nx_huge_page_enabled(); - int max_level; + bool map_writable; - /* Note, paging is disabled, ergo gva == gpa. */ - pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + gfn_t gfn = gpa >> PAGE_SHIFT; + unsigned long mmu_seq; + kvm_pfn_t pfn; + int level, r; - gpa &= PAGE_MASK; + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4199,10 +4195,8 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, if (r) return r; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - - /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ - max_level = lpage_disallowed ? PT_PAGE_TABLE_LEVEL : PT_DIRECTORY_LEVEL; + if (lpage_disallowed) + max_level = PT_PAGE_TABLE_LEVEL; level = mapping_level(vcpu, gfn, &max_level); if (level > PT_PAGE_TABLE_LEVEL) @@ -4217,7 +4211,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) return RET_PF_RETRY; - if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, is_tdp ? 
0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -4228,14 +4222,25 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, goto out_unlock; if (likely(max_level > PT_PAGE_TABLE_LEVEL)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, - prefault, false); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, + is_tdp && lpage_disallowed); + out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); return r; } +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, + u32 error_code, bool prefault) +{ + pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + + /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ + return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, + PT_DIRECTORY_LEVEL, false); +} + int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { @@ -4277,69 +4282,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault) { - kvm_pfn_t pfn; - int r; - int level; - gfn_t gfn = gpa >> PAGE_SHIFT; - unsigned long mmu_seq; - int write = error_code & PFERR_WRITE_MASK; - bool map_writable; - bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && - is_nx_huge_page_enabled(); int max_level; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - - if (page_fault_handle_page_track(vcpu, error_code, gfn)) - return RET_PF_EMULATE; - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - for (max_level = PT_MAX_HUGEPAGE_LEVEL; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { int page_num = KVM_PAGES_PER_HPAGE(max_level); - gfn_t base = gfn & ~(page_num - 1); + gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; } - if (lpage_disallowed) - max_level = PT_PAGE_TABLE_LEVEL; - - level = mapping_level(vcpu, gfn, &max_level); - if (level > PT_PAGE_TABLE_LEVEL) - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - - if (fast_page_fault(vcpu, gpa, level, error_code)) - return RET_PF_RETRY; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return RET_PF_RETRY; - - if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) - return r; - - r = RET_PF_RETRY; - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) - goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) - goto out_unlock; - if (likely(max_level > PT_PAGE_TABLE_LEVEL)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, - prefault, lpage_disallowed); -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return r; + return direct_page_fault(vcpu, gpa, error_code, prefault, + max_level, true); } static void nonpaging_init_context(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 0885904d4ff7e2d926caf743537ddd411ff22bfa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:25 -0800 Subject: KVM: x86/mmu: Move transparent_hugepage_adjust() above __direct_map() Move thp_adjust() above __direct_map() in preparation of calling thp_adjust() from __direct_map() and FNAME(fetch). No functional change intended. 
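The subtle part of the moved function is the pfn re-alignment: when promoting a 4k mapping to a 2MB page, the page-offset bits of gfn and pfn must already agree, and the pfn is rounded down to the compound head so the spte points at the start of the huge frame. A worked sketch of just that step (a 2MB page spans 512 4k frames, so mask == 0x1ff):

	unsigned long mask = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1;

	VM_BUG_ON((gfn & mask) != (pfn & mask));  /* low 9 bits must match */
	if (pfn & mask) {
		kvm_release_pfn_clean(pfn);       /* drop ref on tail pfn */
		pfn &= ~mask;                     /* e.g. 0x1234f -> 0x12200 */
		kvm_get_pfn(pfn);                 /* take ref on head pfn */
	}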
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 76 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 313f77a34262..f8da3965c3e9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3328,6 +3328,44 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, + gfn_t gfn, kvm_pfn_t *pfnp, + int *levelp) +{ + kvm_pfn_t pfn = *pfnp; + int level = *levelp; + + /* + * Check if it's a transparent hugepage. If this would be an + * hugetlbfs page, level wouldn't be set to + * PT_PAGE_TABLE_LEVEL and there would be no adjustment done + * here. + */ + if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && + PageTransCompoundMap(pfn_to_page(pfn))) { + unsigned long mask; + /* + * mmu_notifier_retry was successful and we hold the + * mmu_lock here, so the pmd can't become splitting + * from under us, and in turn + * __split_huge_page_refcount() can't run from under + * us and we can safely transfer the refcount from + * PG_tail to PG_head as we switch the pfn to tail to + * head. + */ + *levelp = level = PT_DIRECTORY_LEVEL; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + if (pfn & mask) { + kvm_release_pfn_clean(pfn); + pfn &= ~mask; + kvm_get_pfn(pfn); + *pfnp = pfn; + } + } +} + static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) { @@ -3418,44 +3456,6 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t gfn, kvm_pfn_t *pfnp, - int *levelp) -{ - kvm_pfn_t pfn = *pfnp; - int level = *levelp; - - /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. - */ - if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn))) { - unsigned long mask; - /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } - } -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { -- cgit v1.2.3 From 4cd071d13c5cc671826571d9876f76d001937a8b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:26 -0800 Subject: KVM: x86/mmu: Move calls to thp_adjust() down a level Move the calls to thp_adjust() down a level from the page fault handlers to the map/fetch helpers and remove the page count shuffling done in thp_adjust(). 
Despite holding a reference to the underlying page while processing a page fault, the page fault flows don't actually rely on holding a reference to the page when thp_adjust() is called. At that point, the fault handlers hold mmu_lock, which prevents mmu_notifier from completing any invalidations, and have verified no invalidations from mmu_notifier have occurred since the page reference was acquired (which is done prior to taking mmu_lock). The kvm_release_pfn_clean()/kvm_get_pfn() dance in thp_adjust() is a quirk that is necessitated because thp_adjust() modifies the pfn that is consumed by its caller. Because the page fault handlers call kvm_release_pfn_clean() on said pfn, thp_adjust() needs to transfer the reference to the correct pfn purely for correctness when the pfn is released. Calling thp_adjust() from __direct_map() and FNAME(fetch) means the pfn adjustment doesn't change the pfn as seen by the page fault handlers, i.e. the pfn released by the page fault handlers is the same pfn that was returned by gfn_to_pfn(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 31 ++++++++++++------------------- arch/x86/kvm/mmu/paging_tmpl.h | 11 ++++++----- 2 files changed, 18 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f8da3965c3e9..c4ed746416ba 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3345,24 +3345,15 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn))) { unsigned long mask; + /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. + * mmu_notifier_retry() was successful and mmu_lock is held, so + * the pmd can't be split from under us. 
*/ *levelp = level = PT_DIRECTORY_LEVEL; mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - kvm_get_pfn(pfn); - *pfnp = pfn; - } + *pfnp = pfn & ~mask; } } @@ -3390,8 +3381,9 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, - int map_writable, int level, kvm_pfn_t pfn, - bool prefault, bool account_disallowed_nx_lpage) + int map_writable, int level, int max_level, + kvm_pfn_t pfn, bool prefault, + bool account_disallowed_nx_lpage) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3402,6 +3394,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return RET_PF_RETRY; + if (likely(max_level > PT_PAGE_TABLE_LEVEL)) + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { /* @@ -4220,10 +4215,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (likely(max_level > PT_PAGE_TABLE_LEVEL)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault, - is_tdp && lpage_disallowed); + r = __direct_map(vcpu, gpa, write, map_writable, level, max_level, pfn, + prefault, is_tdp && lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d57ec576df0..3b0ba2a77e28 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -613,7 +613,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, - int write_fault, int hlevel, + int write_fault, int hlevel, int max_level, kvm_pfn_t pfn, bool map_writable, bool prefault, bool lpage_disallowed) { @@ -673,6 +673,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); base_gfn = gfn; + if (max_level > PT_PAGE_TABLE_LEVEL) + transparent_hugepage_adjust(vcpu, gw->gfn, &pfn, &hlevel); + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { @@ -865,10 +868,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - if (max_level > PT_PAGE_TABLE_LEVEL) - transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, max_level, + pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: -- cgit v1.2.3 From ddce6208217c1aac22eec74461afb73e2af1fb06 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:27 -0800 Subject: KVM: x86/mmu: Move root_hpa validity checks to top of page fault handler Add a check on root_hpa at the beginning of the page fault handler to consolidate several checks on root_hpa that are scattered throughout the page fault code. 
This is a preparatory step towards eventually removing such checks altogether, or at the very least WARNing if an invalid root is encountered. Remove only the checks that can be easily audited to confirm that root_hpa cannot be invalidated between their current location and the new check in kvm_mmu_page_fault(), and aren't currently protected by mmu_lock, i.e. keep the checks in __direct_map() and FNAME(fetch) for the time being. The root_hpa checks that are consolidated were all added by commit 37f6a4e237303 ("KVM: x86: handle invalid root_hpa everywhere"), which was a follow-up to a bug fix for __direct_map(), commit 989c6b34f6a94 ("KVM: MMU: handle invalid root_hpa at __direct_map"). At the time, nested VMX had, in hindsight, crazy handling of nested interrupts and would trigger a nested VM-Exit in ->interrupt_allowed(), and thus unexpectedly reset the MMU in flows such as can_do_async_pf(). Now that the wonky nested VM-Exit behavior is gone, the root_hpa checks are bogus and confusing, e.g. it's not at all obvious what they actually protect against, and at first glance they appear to be broken since many of them run without holding mmu_lock. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c4ed746416ba..bdba15ef88aa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3565,9 +3565,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, u64 spte = 0ull; uint retry_count = 0; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return false; - if (!page_fault_can_be_fast(error_code)) return false; @@ -4011,9 +4008,6 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - goto exit; - walk_shadow_page_lockless_begin(vcpu); for (shadow_walk_init(&iterator, vcpu, addr), @@ -4043,7 +4037,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) root--; } } -exit: + *sptep = spte; return reserved; } @@ -4107,9 +4101,6 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) struct kvm_shadow_walk_iterator iterator; u64 spte; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { clear_sp_write_flooding_count(iterator.sptep); @@ -5472,6 +5463,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, int r, emulation_type = 0; bool direct = vcpu->arch.mmu->direct_map; + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + return RET_PF_RETRY; + /* With shadow page tables, fault_address contains a GVA or nGPA. */ if (vcpu->arch.mmu->direct_map) { vcpu->arch.gpa_available = true; -- cgit v1.2.3 From 0c7a98e34ddae6e45938f02d4ce7f04114ef0bdc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:28 -0800 Subject: KVM: x86/mmu: WARN on an invalid root_hpa WARN on the existing invalid root_hpa checks in __direct_map() and FNAME(fetch). The "legitimate" path that invalidated root_hpa in the middle of a page fault is long since gone, i.e. it should no longer be possible to invalidate root_hpa in the middle of a page fault[*].
The root_hpa checks were added by two related commits 989c6b34f6a94 ("KVM: MMU: handle invalid root_hpa at __direct_map") 37f6a4e237303 ("KVM: x86: handle invalid root_hpa everywhere") to fix a bug where nested_vmx_vmexit() could be called *in the middle* of a page fault. At the time, vmx_interrupt_allowed(), which was and still is used by kvm_can_do_async_pf() via ->interrupt_allowed(), directly invoked nested_vmx_vmexit() to switch from L2 to L1 to emulate a VM-Exit on a pending interrupt. Emulating the nested VM-Exit resulted in root_hpa being invalidated by kvm_mmu_reset_context() without explicitly terminating the page fault. Now that root_hpa is checked for validity by kvm_mmu_page_fault(), WARN on an invalid root_hpa to detect any flows that reset the MMU while handling a page fault. The broken vmx_interrupt_allowed() behavior has long since been fixed and resetting the MMU during a page fault should not be considered legal behavior. [*] It's actually technically possible in FNAME(page_fault)() because it calls inject_page_fault() when the guest translation is invalid, but in that case the page fault handling is immediately terminated. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index bdba15ef88aa..51f81f10f9f7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3391,7 +3391,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; if (likely(max_level > PT_PAGE_TABLE_LEVEL)) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 3b0ba2a77e28..b53bed3c901c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -637,7 +637,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) goto out_gpte_changed; for (shadow_walk_init(&it, vcpu, addr); -- cgit v1.2.3 From 6948199a9af969342a6faf257678616aea491fcf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 6 Dec 2019 15:57:29 -0800 Subject: KVM: x86/mmu: WARN if root_hpa is invalid when handling a page fault WARN if root_hpa is invalid when handling a page fault. The check on root_hpa exists for historical reasons that no longer apply to the current KVM code base. Remove an equivalent debug-only warning in direct_page_fault(), whose existence more or less confirms that root_hpa should always be valid when handling a page fault. 
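The consolidation leans on the kernel's WARN_ON() being an expression that evaluates to its condition, which lets a caller warn and bail in one statement. A rough userspace approximation of the idiom, purely for illustration and not the kernel's implementation:

#include <stdbool.h>
#include <stdio.h>

#define INVALID_PAGE	((unsigned long)-1)
#define VALID_PAGE(x)	((x) != INVALID_PAGE)
#define RET_PF_RETRY	0

/* GNU C statement expression, mimicking the shape of the kernel's WARN_ON(). */
#define WARN_ON(cond) ({						\
	bool __ret = (cond);						\
	if (__ret)							\
		fprintf(stderr, "WARNING: %s:%d: %s\n",			\
			__FILE__, __LINE__, #cond);			\
	__ret;								\
})

static int page_fault_sketch(unsigned long root_hpa)
{
	/* One guard at the top replaces the checks scattered through the flow. */
	if (WARN_ON(!VALID_PAGE(root_hpa)))
		return RET_PF_RETRY;

	/* ... fault handling would proceed here ... */
	return 1;
}

int main(void)
{
	page_fault_sketch(INVALID_PAGE);	/* takes the warn-and-retry path */
	return 0;
}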
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 51f81f10f9f7..7269130ea5e2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4172,8 +4172,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, kvm_pfn_t pfn; int level, r; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); - if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -5463,7 +5461,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, int r, emulation_type = 0; bool direct = vcpu->arch.mmu->direct_map; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) + if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; /* With shadow page tables, fault_address contains a GVA or nGPA. */ -- cgit v1.2.3 From 4de0a8355463e068e443b48eb5ae32370155368b Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 19 Nov 2019 14:27:40 +0800 Subject: KVM: PPC: Remove set but not used variable 'ra', 'rs', 'rt' Fixes gcc '-Wunused-but-set-variable' warning: arch/powerpc/kvm/emulate_loadstore.c: In function kvmppc_emulate_loadstore: arch/powerpc/kvm/emulate_loadstore.c:87:6: warning: variable ra set but not used [-Wunused-but-set-variable] arch/powerpc/kvm/emulate_loadstore.c: In function kvmppc_emulate_loadstore: arch/powerpc/kvm/emulate_loadstore.c:87:10: warning: variable rs set but not used [-Wunused-but-set-variable] arch/powerpc/kvm/emulate_loadstore.c: In function kvmppc_emulate_loadstore: arch/powerpc/kvm/emulate_loadstore.c:87:14: warning: variable rt set but not used [-Wunused-but-set-variable] They are not used since commit 2b33cb585f94 ("KVM: PPC: Reimplement LOAD_FP/STORE_FP instruction mmio emulation with analyse_instr() input") Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/emulate_loadstore.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 2e496eb86e94..1139bc56e004 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -73,7 +73,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 inst; - int ra, rs, rt; enum emulation_result emulated = EMULATE_FAIL; int advance = 1; struct instruction_op op; @@ -85,10 +84,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) if (emulated != EMULATE_DONE) return emulated; - ra = get_ra(inst); - rs = get_rs(inst); - rt = get_rt(inst); - vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; -- cgit v1.2.3 From 8a9c8925149f195d0bbd6b42aa3130ced0a075fb Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 26 Nov 2019 19:36:30 -0300 Subject: KVM: PPC: Book3S: Replace current->mm by kvm->mm Given that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have: if (kvm->mm != current->mm) return -EIO; I see no reason to keep using current->mm instead of kvm->mm. By doing so, we would reduce the use of 'global' variables in the code, relying more on the contents of the kvm struct.
Signed-off-by: Leonardo Bras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/book3s_64_vio.c | 10 ++++++---- arch/powerpc/kvm/book3s_hv.c | 10 +++++----- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index d381526c5c9b..6c372f5c61b6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -284,7 +284,7 @@ static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, /* Protect linux PTE lookup from page table destruction */ rcu_read_lock_sched(); /* this disables preemption too */ ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, - current->mm->pgd, false, pte_idx_ret); + kvm->mm->pgd, false, pte_idx_ret); rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ @@ -573,7 +573,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, is_ci = false; pfn = 0; page = NULL; - mm = current->mm; + mm = kvm->mm; pte_size = PAGE_SIZE; writing = (dsisr & DSISR_ISSTORE) != 0; /* If writing != 0, then the HPTE must allow writing, if we get here */ diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 883a66e76638..ee6c103bb7d5 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -253,10 +253,11 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) } } + account_locked_vm(kvm->mm, + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); + kvm_put_kvm(stt->kvm); - account_locked_vm(current->mm, - kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -272,6 +273,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, { struct kvmppc_spapr_tce_table *stt = NULL; struct kvmppc_spapr_tce_table *siter; + struct mm_struct *mm = kvm->mm; unsigned long npages, size = args->size; int ret = -ENOMEM; @@ -280,7 +282,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EINVAL; npages = kvmppc_tce_pages(size); - ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true); + ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true); if (ret) return ret; @@ -326,7 +328,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kfree(stt); fail_acct: - account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); + account_locked_vm(mm, kvmppc_stt_pages(npages), false); return ret; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6ff3f896d908..2cf3dd8b79d2 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4285,7 +4285,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) user_vrsave = mfspr(SPRN_VRSAVE); vcpu->arch.wqp = &vcpu->arch.vcore->wq; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = kvm->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { @@ -4640,14 +4640,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Look up the VMA for the start of this memory slot */ hva = memslot->userspace_addr; - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, hva); + down_read(&kvm->mm->mmap_sem); + vma = find_vma(kvm->mm, hva); if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) goto up_out; psize = vma_kernel_pagesize(vma); - up_read(¤t->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); /* We can handle 4k, 64k or 16M pages in the VRMA */ if (psize >= 0x1000000) @@ 
-4680,7 +4680,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) return err; up_out: - up_read(&current->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); goto out_srcu; } -- cgit v1.2.3 From e1bd0a7e248c3ce59b0509e47f035c0759fc68a3 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 26 Nov 2019 19:36:31 -0300 Subject: KVM: PPC: Book3E: Replace current->mm by kvm->mm Given that in kvm_create_vm() there is: kvm->mm = current->mm; And that on every kvm_*_ioctl we have: if (kvm->mm != current->mm) return -EIO; I see no reason to keep using current->mm instead of kvm->mm. By doing so, we would reduce the use of 'global' variables in the code, relying more on the contents of the kvm struct. Signed-off-by: Leonardo Bras Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/booke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index be9a45874194..fd7bdb4f8f87 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -775,7 +775,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) debug = current->thread.debug; current->thread.debug = vcpu->arch.dbg_reg; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = vcpu->kvm->mm->pgd; kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); -- cgit v1.2.3 From ce477a7a1cdfc9aaafcfd03b45bde131a88d51de Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 19 Dec 2019 13:51:45 -0800 Subject: KVM: PPC: Add skip_page_out parameter to uvmem functions Add 'skip_page_out' parameter to kvmppc_uvmem_drop_pages() so the callers can specify whether or not to skip paging out pages. This will be needed in a follow-on patch that implements the H_SVM_INIT_ABORT hcall. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_uvmem.h | 4 ++-- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_uvmem.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h index 50204e228f16..3cf8425b9838 100644 --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h @@ -20,7 +20,7 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm); unsigned long kvmppc_h_svm_init_done(struct kvm *kvm); int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn); void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, - struct kvm *kvm); + struct kvm *kvm, bool skip_page_out); #else static inline int kvmppc_uvmem_init(void) { @@ -69,6 +69,6 @@ static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) static inline void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, - struct kvm *kvm) { } + struct kvm *kvm, bool skip_page_out) { } #endif /* CONFIG_PPC_UV */ #endif /* __ASM_KVM_BOOK3S_UVMEM_H__ */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index da857c8ba6e4..744dba98e5d1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -1102,7 +1102,7 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm, unsigned int shift; if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START) - kvmppc_uvmem_drop_pages(memslot, kvm); + kvmppc_uvmem_drop_pages(memslot, kvm, true); if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return; diff --git
a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2cf3dd8b79d2..47ffc7f1b104 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5477,7 +5477,7 @@ static int kvmhv_svm_off(struct kvm *kvm) continue; kvm_for_each_memslot(memslot, slots) { - kvmppc_uvmem_drop_pages(memslot, kvm); + kvmppc_uvmem_drop_pages(memslot, kvm, true); uv_unregister_mem_slot(kvm->arch.lpid, memslot->id); } } diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 2de264fc3156..ffa602a97dba 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -258,7 +258,7 @@ unsigned long kvmppc_h_svm_init_done(struct kvm *kvm) * QEMU page table with normal PTEs from newly allocated pages. */ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, - struct kvm *kvm) + struct kvm *kvm, bool skip_page_out) { int i; struct kvmppc_uvmem_page_pvt *pvt; @@ -276,7 +276,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, uvmem_page = pfn_to_page(uvmem_pfn); pvt = uvmem_page->zone_device_data; - pvt->skip_page_out = true; + pvt->skip_page_out = skip_page_out; mutex_unlock(&kvm->arch.uvmem_lock); pfn = gfn_to_pfn(kvm, gfn); -- cgit v1.2.3 From 3a43970d55e9fd5475d3c4e5fe398ab831ec6c3a Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 6 Jan 2020 18:02:37 -0800 Subject: KVM: PPC: Book3S HV: Implement H_SVM_INIT_ABORT hcall Implement the H_SVM_INIT_ABORT hcall which the Ultravisor can use to abort an SVM after it has issued the H_SVM_INIT_START and before the H_SVM_INIT_DONE hcalls. This hcall could be used when the Ultravisor encounters security violations or other errors when starting an SVM. Note that this hcall is different from the UV_SVM_TERMINATE ucall, which is used by the HV to terminate/clean up a VM that has become secure. The H_SVM_INIT_ABORT basically undoes operations that were done since the H_SVM_INIT_START hcall - i.e. page out all the VM pages back to normal memory, and terminate the SVM. (If we do not bring the pages back to normal memory, the text/data of the VM would be stuck in secure memory and since the SVM did not go secure, its MSR_S bit will be clear and the VM won't be able to access its pages even to do a clean exit). Based on patches and discussion with Paul Mackerras, Ram Pai and Bharata Rao. Signed-off-by: Ram Pai Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Bharata B Rao Signed-off-by: Paul Mackerras --- Documentation/powerpc/ultravisor.rst | 60 +++++++++++++++++++++++++++++ arch/powerpc/include/asm/hvcall.h | 1 + arch/powerpc/include/asm/kvm_book3s_uvmem.h | 6 +++ arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_hv.c | 3 ++ arch/powerpc/kvm/book3s_hv_uvmem.c | 28 ++++++++++++ 6 files changed, 99 insertions(+) (limited to 'arch') diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst index 730854f73830..363736d7fd36 100644 --- a/Documentation/powerpc/ultravisor.rst +++ b/Documentation/powerpc/ultravisor.rst @@ -948,6 +948,66 @@ Use cases up its internal state for this virtual machine. +H_SVM_INIT_ABORT +---------------- + + Abort the process of securing an SVM. +Syntax +~~~~~~ +.. code-block:: c + + uint64_t hypercall(const uint64_t H_SVM_INIT_ABORT) +Return values +~~~~~~~~~~~~~ + One of the following values: + + * H_PARAMETER on successfully cleaning up the state, Hypervisor will return this value to the **guest**, to indicate that the underlying UV_ESM ultracall failed.
+ + * H_STATE if called after a VM has gone secure (i.e + H_SVM_INIT_DONE hypercall was successful). + + * H_UNSUPPORTED if called from a wrong context (e.g. from a + normal VM). + +Description +~~~~~~~~~~~ + + Abort the process of securing a virtual machine. This call must + be made after a prior call to ``H_SVM_INIT_START`` hypercall and + before a call to ``H_SVM_INIT_DONE``. + + On entry into this hypercall the non-volatile GPRs and FPRs are + expected to contain the values they had at the time the VM issued + the UV_ESM ultracall. Further ``SRR0`` is expected to contain the + address of the instruction after the ``UV_ESM`` ultracall and ``SRR1`` + the MSR value with which to return to the VM. + + This hypercall will cleanup any partial state that was established for + the VM since the prior ``H_SVM_INIT_START`` hypercall, including paging + out pages that were paged-into secure memory, and issue the + ``UV_SVM_TERMINATE`` ultracall to terminate the VM. + + After the partial state is cleaned up, control returns to the VM + (**not Ultravisor**), at the address specified in ``SRR0`` with the + MSR values set to the value in ``SRR1``. + +Use cases +~~~~~~~~~ + + If after a successful call to ``H_SVM_INIT_START``, the Ultravisor + encounters an error while securing a virtual machine, either due + to lack of resources or because the VM's security information could + not be validated, Ultravisor informs the Hypervisor about it. + Hypervisor should use this call to clean up any internal state for + this virtual machine and return to the VM. + H_SVM_PAGE_IN ------------- diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 13bd870609c3..e90c073e437e 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -350,6 +350,7 @@ #define H_SVM_PAGE_OUT 0xEF04 #define H_SVM_INIT_START 0xEF08 #define H_SVM_INIT_DONE 0xEF0C +#define H_SVM_INIT_ABORT 0xEF14 /* Values for 2nd argument to H_SET_MODE */ #define H_SET_MODE_RESOURCE_SET_CIABR 1 diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h index 3cf8425b9838..5a9834e0e2d1 100644 --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h @@ -19,6 +19,7 @@ unsigned long kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long kvmppc_h_svm_init_start(struct kvm *kvm); unsigned long kvmppc_h_svm_init_done(struct kvm *kvm); int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn); +unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm); void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, struct kvm *kvm, bool skip_page_out); #else @@ -62,6 +63,11 @@ static inline unsigned long kvmppc_h_svm_init_done(struct kvm *kvm) return H_UNSUPPORTED; } +static inline unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm) +{ + return H_UNSUPPORTED; +} + static inline int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) { return -EFAULT; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0a398f2321c2..6e8b8ffd06ad 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -278,6 +278,7 @@ struct kvm_resize_hpt; /* Flag values for kvm_arch.secure_guest */ #define KVMPPC_SECURE_INIT_START 0x1 /* H_SVM_INIT_START has been called */ #define KVMPPC_SECURE_INIT_DONE 0x2 /* H_SVM_INIT_DONE completed */ +#define KVMPPC_SECURE_INIT_ABORT 0x4 /* H_SVM_INIT_ABORT issued */ struct kvm_arch { unsigned int lpid; diff --git 
a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 47ffc7f1b104..1118cff7f7ef 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1091,6 +1091,9 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_SVM_INIT_DONE: ret = kvmppc_h_svm_init_done(vcpu->kvm); break; + case H_SVM_INIT_ABORT: + ret = kvmppc_h_svm_init_abort(vcpu->kvm); + break; default: return RESUME_HOST; diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index ffa602a97dba..4d1f25a3959a 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -286,6 +286,34 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, } } +unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm) +{ + int srcu_idx; + struct kvm_memory_slot *memslot; + + /* + * Expect to be called only after INIT_START and before INIT_DONE. + * If INIT_DONE was completed, use normal VM termination sequence. + */ + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) + return H_UNSUPPORTED; + + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return H_STATE; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) + kvmppc_uvmem_drop_pages(memslot, kvm, false); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + kvm->arch.secure_guest = 0; + uv_svm_terminate(kvm->arch.lpid); + + return H_PARAMETER; +} + /* * Get a free device PFN from the pool * -- cgit v1.2.3 From b6ae256afd32f96bec0117175b329d0dd617655e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 12 Dec 2019 20:50:55 +0100 Subject: KVM: arm64: Only sign-extend MMIO up to register width On AArch64 you can do a sign-extended load to either a 32-bit or 64-bit register, and we should only sign extend the register up to the width of the register as specified in the operation (by using the 32-bit Wn or 64-bit Xn register specifier). As it turns out, the architecture provides this decoding information in the SF ("Sixty-Four" -- how cute...) bit. Let's take advantage of this with the usual 32-bit/64-bit header file dance and do the right thing on AArch64 hosts. 
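The effect of the fix can be sketched in a few lines of standalone C: sign-extend the loaded bytes with the xor/subtract trick the mmio code already uses, then truncate to 32 bits when the SF bit says the destination was a Wn register. This illustrates the semantics only; it is not the kernel code itself.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t mmio_read_result(uint64_t data, unsigned int len,
				 bool sign_extend, bool sixty_four)
{
	if (sign_extend && len < 8) {
		uint64_t mask = 1ULL << (len * 8 - 1);	/* sign bit of the access */

		data = (data ^ mask) - mask;		/* xor/subtract sign extension */
	}

	if (!sixty_four)
		data &= 0xffffffff;	/* Wn destination: upper 32 bits read as zero */

	return data;
}

int main(void)
{
	/* ldrsb w0, [x1]: a one-byte load of 0x80 into a 32-bit register... */
	printf("%#llx\n", (unsigned long long)mmio_read_result(0x80, 1, true, false));
	/* ...versus ldrsb x0, [x1]: the same load into a 64-bit register. */
	printf("%#llx\n", (unsigned long long)mmio_read_result(0x80, 1, true, true));
	return 0;
}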
Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191212195055.5541-1-christoffer.dall@arm.com --- arch/arm/include/asm/kvm_emulate.h | 5 +++++ arch/arm/include/asm/kvm_mmio.h | 2 ++ arch/arm64/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/include/asm/kvm_mmio.h | 6 ++---- virt/kvm/arm/mmio.c | 6 ++++++ 5 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 9b118516d2db..fe55d8737a11 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -182,6 +182,11 @@ static inline bool kvm_vcpu_dabt_issext(struct kvm_vcpu *vcpu) return kvm_vcpu_get_hsr(vcpu) & HSR_SSE; } +static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu) +{ + return false; +} + static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu) { return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT; diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h index 7c0eddb0adb2..32fbf82e3ebc 100644 --- a/arch/arm/include/asm/kvm_mmio.h +++ b/arch/arm/include/asm/kvm_mmio.h @@ -14,6 +14,8 @@ struct kvm_decode { unsigned long rt; bool sign_extend; + /* Not used on 32-bit arm */ + bool sixty_four; }; void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 5efe5ca8fecf..f407b6bdad2e 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -283,6 +283,11 @@ static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu) return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE); } +static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu) +{ + return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SF); +} + static inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu) { return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h index 02b5c48fd467..b204501a0c39 100644 --- a/arch/arm64/include/asm/kvm_mmio.h +++ b/arch/arm64/include/asm/kvm_mmio.h @@ -10,13 +10,11 @@ #include #include -/* - * This is annoying. The mmio code requires this, even if we don't - * need any decoding. To be fixed. 
- */ struct kvm_decode { unsigned long rt; bool sign_extend; + /* Width of the register accessed by the faulting instruction is 64 bits */ + bool sixty_four; }; void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index 70d3b449692c..1bb71acd53f2 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -105,6 +105,9 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) data = (data ^ mask) - mask; } + if (!vcpu->arch.mmio_decode.sixty_four) + data = data & 0xffffffff; + trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); @@ -125,6 +128,7 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) unsigned long rt; int access_size; bool sign_extend; + bool sixty_four; if (kvm_vcpu_dabt_iss1tw(vcpu)) { /* page table accesses IO mem: tell guest to fix its TTBR */ @@ -138,11 +142,13 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) *is_write = kvm_vcpu_dabt_iswrite(vcpu); sign_extend = kvm_vcpu_dabt_issext(vcpu); + sixty_four = kvm_vcpu_dabt_issf(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); *len = access_size; vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; + vcpu->arch.mmio_decode.sixty_four = sixty_four; return 0; } -- cgit v1.2.3 From f5523423defb0d929e23813c8dd16c0131043a8c Mon Sep 17 00:00:00 2001 From: Russell King Date: Sat, 28 Dec 2019 11:57:14 +0000 Subject: arm64: kvm: Fix IDMAP overlap with HYP VA Booting 5.4 on LX2160A reveals that KVM is non-functional: kvm: Limiting the IPA size due to kernel Virtual Address limit kvm [1]: IPA Size Limit: 43bits kvm [1]: IDMAP intersecting with HYP VA, unable to continue kvm [1]: error initializing Hyp mode: -22 Debugging shows: kvm [1]: IDMAP page: 81a26000 kvm [1]: HYP VA range: 0:22ffffffff as RAM is located at: 80000000-fbdfffff : System RAM 2080000000-237fffffff : System RAM Comparing this with the same kernel on Armada 8040 shows: kvm: Limiting the IPA size due to kernel Virtual Address limit kvm [1]: IPA Size Limit: 43bits kvm [1]: IDMAP page: 2a26000 kvm [1]: HYP VA range: 4800000000:493fffffff ... kvm [1]: Hyp mode initialized successfully which indicates that hyp_va_msb is set, and is always set to the opposite value of the idmap page to avoid the overlap. This does not happen with the LX2160A. Further debugging shows vabits_actual = 39, kva_msb = 38 on LX2160A and kva_msb = 33 on Armada 8040. Looking at the bit layout of the HYP VA, there is still one bit available for hyp_va_msb. Set this bit appropriately. This allows KVM to be functional on the LX2160A, but without any HYP VA randomisation: kvm: Limiting the IPA size due to kernel Virtual Address limit kvm [1]: IPA Size Limit: 43bits kvm [1]: IDMAP page: 81a24000 kvm [1]: HYP VA range: 4000000000:62ffffffff ...
kvm [1]: Hyp mode initialized successfully Fixes: ed57cac83e05 ("arm64: KVM: Introduce EL2 VA randomisation") Signed-off-by: Russell King [maz: small additional cleanups, preserved case where the tag is legitimately 0 and we can just use the mask, Fixes tag] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/E1ilAiY-0000MA-RG@rmk-PC.armlinux.org.uk --- arch/arm64/kvm/va_layout.c | 56 +++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index dab1fea4752a..a4f48c1ac28c 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -13,52 +13,46 @@ #include /* - * The LSB of the random hyp VA tag or 0 if no randomization is used. + * The LSB of the HYP VA tag */ static u8 tag_lsb; /* - * The random hyp VA tag value with the region bit if hyp randomization is used + * The HYP VA tag value with the region bit */ static u64 tag_val; static u64 va_mask; +/* + * We want to generate a hyp VA with the following format (with V == + * vabits_actual): + * + * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 0 + * --------------------------------------------------------- + * | 0000000 | hyp_va_msb | random tag | kern linear VA | + * |--------- tag_val -----------|----- va_mask ---| + * + * which does not conflict with the idmap regions. + */ __init void kvm_compute_layout(void) { phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start); u64 hyp_va_msb; - int kva_msb; /* Where is my RAM region? */ hyp_va_msb = idmap_addr & BIT(vabits_actual - 1); hyp_va_msb ^= BIT(vabits_actual - 1); - kva_msb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^ + tag_lsb = fls64((u64)phys_to_virt(memblock_start_of_DRAM()) ^ (u64)(high_memory - 1)); - if (kva_msb == (vabits_actual - 1)) { - /* - * No space in the address, let's compute the mask so - * that it covers (vabits_actual - 1) bits, and the region - * bit. The tag stays set to zero. - */ - va_mask = BIT(vabits_actual - 1) - 1; - va_mask |= hyp_va_msb; - } else { - /* - * We do have some free bits to insert a random tag. - * Hyp VAs are now created from kernel linear map VAs - * using the following formula (with V == vabits_actual): - * - * 63 ... V | V-1 | V-2 .. tag_lsb | tag_lsb - 1 .. 0 - * --------------------------------------------------------- - * | 0000000 | hyp_va_msb | random tag | kern linear VA | - */ - tag_lsb = kva_msb; - va_mask = GENMASK_ULL(tag_lsb - 1, 0); - tag_val = get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb); - tag_val |= hyp_va_msb; - tag_val >>= tag_lsb; + va_mask = GENMASK_ULL(tag_lsb - 1, 0); + tag_val = hyp_va_msb; + + if (tag_lsb != (vabits_actual - 1)) { + /* We have some free bits to insert a random tag. */ + tag_val |= get_random_long() & GENMASK_ULL(vabits_actual - 2, tag_lsb); } + tag_val >>= tag_lsb; } static u32 compute_instruction(int n, u32 rd, u32 rn) @@ -117,11 +111,11 @@ void __init kvm_update_va_mask(struct alt_instr *alt, * VHE doesn't need any address translation, let's NOP * everything. * - * Alternatively, if we don't have any spare bits in - * the address, NOP everything after masking that - * kernel VA. + * Alternatively, if the tag is zero (because the layout + * dictates it and we don't have any spare bits in the + * address), NOP everything after masking the kernel VA. 
*/ - if (has_vhe() || (!tag_lsb && i > 0)) { + if (has_vhe() || (!tag_val && i > 0)) { updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); continue; } -- cgit v1.2.3 From a425372e733177eb0779748956bc16c85167af48 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 8 Jan 2020 13:43:22 +0000 Subject: KVM: arm64: Correct PSTATE on exception entry When KVM injects an exception into a guest, it generates the PSTATE value from scratch, configuring PSTATE.{M[4:0],DAIF}, and setting all other bits to zero. This isn't correct, as the architecture specifies that some PSTATE bits are (conditionally) cleared or set upon an exception, and others are unchanged from the original context. This patch adds logic to match the architectural behaviour. To make this simple to follow/audit/extend, documentation references are provided, and bits are configured in order of their layout in SPSR_EL2. This layout can be seen in the diagram on ARM DDI 0487E.a page C5-429. Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200108134324.46500-2-mark.rutland@arm.com --- arch/arm64/include/uapi/asm/ptrace.h | 1 + arch/arm64/kvm/inject_fault.c | 70 +++++++++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 7ed9294e2004..d1bb5b69f1ce 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -49,6 +49,7 @@ #define PSR_SSBS_BIT 0x00001000 #define PSR_PAN_BIT 0x00400000 #define PSR_UAO_BIT 0x00800000 +#define PSR_DIT_BIT 0x01000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 #define PSR_Z_BIT 0x40000000 diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index ccdb6a051ab2..6aafc2825c1c 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -14,9 +14,6 @@ #include #include -#define PSTATE_FAULT_BITS_64 (PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | \ - PSR_I_BIT | PSR_D_BIT) - #define CURRENT_EL_SP_EL0_VECTOR 0x0 #define CURRENT_EL_SP_ELx_VECTOR 0x200 #define LOWER_EL_AArch64_VECTOR 0x400 @@ -50,6 +47,69 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type) return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type; } +/* + * When an exception is taken, most PSTATE fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all + * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx + * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0. + * + * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429. + * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426. + * + * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from + * MSB to LSB. + */ +static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu) +{ + unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_N_BIT); + new |= (old & PSR_Z_BIT); + new |= (old & PSR_C_BIT); + new |= (old & PSR_V_BIT); + + // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests) + + new |= (old & PSR_DIT_BIT); + + // PSTATE.UAO is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D5-2579. 
+ + // PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0 + // SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page D5-2578. + new |= (old & PSR_PAN_BIT); + if (!(sctlr & SCTLR_EL1_SPAN)) + new |= PSR_PAN_BIT; + + // PSTATE.SS is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D2-2452. + + // PSTATE.IL is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, page D1-2306. + + // PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to AArch64 + // See ARM DDI 0487E.a, page D13-3258 + if (sctlr & SCTLR_ELx_DSSBS) + new |= PSR_SSBS_BIT; + + // PSTATE.BTYPE is set to zero upon any exception to AArch64 + // See ARM DDI 0487E.a, pages D1-2293 to D1-2294. + + new |= PSR_D_BIT; + new |= PSR_A_BIT; + new |= PSR_I_BIT; + new |= PSR_F_BIT; + + new |= PSR_MODE_EL1h; + + return new; +} + static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr) { unsigned long cpsr = *vcpu_cpsr(vcpu); @@ -59,7 +119,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; + *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); vcpu_write_spsr(vcpu, cpsr); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -94,7 +154,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu)); *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync); - *vcpu_cpsr(vcpu) = PSTATE_FAULT_BITS_64; + *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu); vcpu_write_spsr(vcpu, cpsr); /* -- cgit v1.2.3 From 3c2483f15499b877ccb53250d88addb8c91da147 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 8 Jan 2020 13:43:23 +0000 Subject: KVM: arm/arm64: Correct CPSR on exception entry When KVM injects an exception into a guest, it generates the CPSR value from scratch, configuring CPSR.{M,A,I,T,E}, and setting all other bits to zero. This isn't correct, as the architecture specifies that some CPSR bits are (conditionally) cleared or set upon an exception, and others are unchanged from the original context. This patch adds logic to match the architectural behaviour. To make this simple to follow/audit/extend, documentation references are provided, and bits are configured in order of their layout in SPSR_EL2. This layout can be seen in the diagram on ARM DDI 0487E.a page C5-426. Note that this code is used by both arm and arm64, and is intended to function with the SPSR_EL2 and SPSR_HYP layouts.
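Both this patch and its AArch64 counterpart follow the same pattern: start from zero, copy over the bits the architecture preserves, then force the bits exception entry sets. A compact, standalone illustration of that pattern for the AArch64 case follows; the bit values match the SPSR_ELx layout, but the conditional PAN/SSBS handling is omitted, so this is a sketch rather than the kernel's get_except64_pstate().

#include <stdio.h>

#define PSR_MODE_EL1h	0x00000005UL
#define PSR_F_BIT	0x00000040UL
#define PSR_I_BIT	0x00000080UL
#define PSR_A_BIT	0x00000100UL
#define PSR_D_BIT	0x00000200UL
#define PSR_NZCV_MASK	0xf0000000UL

static unsigned long except64_pstate_sketch(unsigned long old)
{
	unsigned long new = 0;

	new |= old & PSR_NZCV_MASK;		/* condition flags are inherited */
	new |= PSR_D_BIT | PSR_A_BIT |		/* exception entry masks debug, */
	       PSR_I_BIT | PSR_F_BIT;		/* SError, IRQ and FIQ */
	new |= PSR_MODE_EL1h;			/* target mode: EL1, using SP_EL1 */

	return new;
}

int main(void)
{
	/* An exception taken while the guest had Z and C set. */
	printf("%#lx\n", except64_pstate_sketch(0x60000000UL));
	return 0;
}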
Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200108134324.46500-3-mark.rutland@arm.com --- arch/arm/include/asm/kvm_emulate.h | 12 ++++ arch/arm64/include/asm/ptrace.h | 1 + virt/kvm/arm/aarch32.c | 111 +++++++++++++++++++++++++++++++++---- 3 files changed, 114 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index fe55d8737a11..c488c629e6c8 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -14,13 +14,25 @@ #include /* arm64 compatibility macros */ +#define PSR_AA32_MODE_FIQ FIQ_MODE +#define PSR_AA32_MODE_SVC SVC_MODE #define PSR_AA32_MODE_ABT ABT_MODE #define PSR_AA32_MODE_UND UND_MODE #define PSR_AA32_T_BIT PSR_T_BIT +#define PSR_AA32_F_BIT PSR_F_BIT #define PSR_AA32_I_BIT PSR_I_BIT #define PSR_AA32_A_BIT PSR_A_BIT #define PSR_AA32_E_BIT PSR_E_BIT #define PSR_AA32_IT_MASK PSR_IT_MASK +#define PSR_AA32_GE_MASK 0x000f0000 +#define PSR_AA32_DIT_BIT 0x00200000 +#define PSR_AA32_PAN_BIT 0x00400000 +#define PSR_AA32_SSBS_BIT 0x00800000 +#define PSR_AA32_Q_BIT PSR_Q_BIT +#define PSR_AA32_V_BIT PSR_V_BIT +#define PSR_AA32_C_BIT PSR_C_BIT +#define PSR_AA32_Z_BIT PSR_Z_BIT +#define PSR_AA32_N_BIT PSR_N_BIT unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index fbebb411ae20..bf57308fcd63 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -62,6 +62,7 @@ #define PSR_AA32_I_BIT 0x00000080 #define PSR_AA32_A_BIT 0x00000100 #define PSR_AA32_E_BIT 0x00000200 +#define PSR_AA32_PAN_BIT 0x00400000 #define PSR_AA32_SSBS_BIT 0x00800000 #define PSR_AA32_DIT_BIT 0x01000000 #define PSR_AA32_Q_BIT 0x08000000 diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index c4c57ba99e90..773cf1439081 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -10,6 +10,7 @@ * Author: Christoffer Dall */ +#include #include #include #include @@ -28,22 +29,112 @@ static const u8 return_offsets[8][2] = { [7] = { 4, 4 }, /* FIQ, unused */ }; +/* + * When an exception is taken, most CPSR fields are left unchanged in the + * handler. However, some are explicitly overridden (e.g. M[4:0]). + * + * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with + * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was + * obsoleted by the ARMv7 virtualization extensions and is RES0. + * + * For the SPSR layout seen from AArch32, see: + * - ARM DDI 0406C.d, page B1-1148 + * - ARM DDI 0487E.a, page G8-6264 + * + * For the SPSR_ELx layout for AArch32 seen from AArch64, see: + * - ARM DDI 0487E.a, page C5-426 + * + * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from + * MSB to LSB. 
+ */ +static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) +{ + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + unsigned long old, new; + + old = *vcpu_cpsr(vcpu); + new = 0; + + new |= (old & PSR_AA32_N_BIT); + new |= (old & PSR_AA32_Z_BIT); + new |= (old & PSR_AA32_C_BIT); + new |= (old & PSR_AA32_V_BIT); + new |= (old & PSR_AA32_Q_BIT); + + // CPSR.IT[7:0] are set to zero upon any exception + // See ARM DDI 0487E.a, section G1.12.3 + // See ARM DDI 0406C.d, section B1.8.3 + + new |= (old & PSR_AA32_DIT_BIT); + + // CPSR.SSBS is set to SCTLR.DSSBS upon any exception + // See ARM DDI 0487E.a, page G8-6244 + if (sctlr & BIT(31)) + new |= PSR_AA32_SSBS_BIT; + + // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0 + // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented + // See ARM DDI 0487E.a, page G8-6246 + new |= (old & PSR_AA32_PAN_BIT); + if (!(sctlr & BIT(23))) + new |= PSR_AA32_PAN_BIT; + + // SS does not exist in AArch32, so ignore + + // CPSR.IL is set to zero upon any exception + // See ARM DDI 0487E.a, page G1-5527 + + new |= (old & PSR_AA32_GE_MASK); + + // CPSR.IT[7:0] are set to zero upon any exception + // See prior comment above + + // CPSR.E is set to SCTLR.EE upon any exception + // See ARM DDI 0487E.a, page G8-6245 + // See ARM DDI 0406C.d, page B4-1701 + if (sctlr & BIT(25)) + new |= PSR_AA32_E_BIT; + + // CPSR.A is unchanged upon an exception to Undefined, Supervisor + // CPSR.A is set upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_A_BIT); + if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC) + new |= PSR_AA32_A_BIT; + + // CPSR.I is set upon any exception + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= PSR_AA32_I_BIT; + + // CPSR.F is set upon an exception to FIQ + // CPSR.F is unchanged upon an exception to other modes + // See ARM DDI 0487E.a, pages G1-5515 to G1-5516 + // See ARM DDI 0406C.d, page B1-1182 + new |= (old & PSR_AA32_F_BIT); + if (mode == PSR_AA32_MODE_FIQ) + new |= PSR_AA32_F_BIT; + + // CPSR.T is set to SCTLR.TE upon any exception + // See ARM DDI 0487E.a, page G8-5514 + // See ARM DDI 0406C.d, page B1-1181 + if (sctlr & BIT(30)) + new |= PSR_AA32_T_BIT; + + new |= mode; + + return new; +} + static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) { - unsigned long cpsr; unsigned long new_spsr_value = *vcpu_cpsr(vcpu); bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT); u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); - cpsr = mode | PSR_AA32_I_BIT; - - if (sctlr & (1 << 30)) - cpsr |= PSR_AA32_T_BIT; - if (sctlr & (1 << 25)) - cpsr |= PSR_AA32_E_BIT; - - *vcpu_cpsr(vcpu) = cpsr; + *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); /* Note: These now point to the banked copies */ vcpu_write_spsr(vcpu, new_spsr_value); @@ -84,7 +175,7 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, fsr = &vcpu_cp15(vcpu, c5_DFSR); } - prepare_fault32(vcpu, PSR_AA32_MODE_ABT | PSR_AA32_A_BIT, vect_offset); + prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset); *far = addr; -- cgit v1.2.3 From 1cfbb484de158e378e8971ac40f3082e53ecca55 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 8 Jan 2020 13:43:24 +0000 Subject: KVM: arm/arm64: Correct AArch32 SPSR on exception entry Confusingly, there are three SPSR layouts that a kernel may need to deal with: (1) An AArch64 SPSR_ELx view of an AArch64 pstate (2) An 
AArch64 SPSR_ELx view of an AArch32 pstate (3) An AArch32 SPSR_* view of an AArch32 pstate When the KVM AArch32 support code deals with SPSR_{EL2,HYP}, it's either dealing with #2 or #3 consistently. On arm64 the PSR_AA32_* definitions match the AArch64 SPSR_ELx view, and on arm the PSR_AA32_* definitions match the AArch32 SPSR_* view. However, when we inject an exception into an AArch32 guest, we have to synthesize the AArch32 SPSR_* that the guest will see. Thus, an AArch64 host needs to synthesize layout #3 from layout #2. This patch adds a new host_spsr_to_spsr32() helper for this, and makes use of it in the KVM AArch32 support code. For arm64 we need to shuffle the DIT bit around, and remove the SS bit, while for arm we can use the value as-is. I've open-coded the bit manipulation for now to avoid having to rework the existing PSR_* definitions into PSR64_AA32_* and PSR32_AA32_* definitions. I hope to perform a more thorough refactoring in future so that we can handle pstate view manipulation more consistently across the kernel tree. Signed-off-by: Mark Rutland Signed-off-by: Marc Zyngier Reviewed-by: Alexandru Elisei Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200108134324.46500-4-mark.rutland@arm.com --- arch/arm/include/asm/kvm_emulate.h | 5 +++++ arch/arm64/include/asm/kvm_emulate.h | 32 ++++++++++++++++++++++++++++++++ virt/kvm/arm/aarch32.c | 6 +++--- 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index c488c629e6c8..08d9805f613b 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -53,6 +53,11 @@ static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v) *__vcpu_spsr(vcpu) = v; } +static inline unsigned long host_spsr_to_spsr32(unsigned long spsr) +{ + return spsr; +} + static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu, u8 reg_num) { diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f407b6bdad2e..53ea7637b7b2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -219,6 +219,38 @@ static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v) vcpu_gp_regs(vcpu)->spsr[KVM_SPSR_EL1] = v; } +/* + * The layout of SPSR for an AArch32 state is different when observed from an + * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32 + * view given an AArch64 view. + * + * In ARM DDI 0487E.a see: + * + * - The AArch64 view (SPSR_EL2) in section C5.2.18, page C5-426 + * - The AArch32 view (SPSR_abt) in section G8.2.126, page G8-6256 + * - The AArch32 view (SPSR_und) in section G8.2.132, page G8-6280 + * + * Which show the following differences: + * + * | Bit | AA64 | AA32 | Notes | + * +-----+------+------+-----------------------------| + * | 24 | DIT | J | J is RES0 in ARMv8 | + * | 21 | SS | DIT | SS doesn't exist in AArch32 | + * + * ... and all other bits are (currently) common. 
+ */ +static inline unsigned long host_spsr_to_spsr32(unsigned long spsr) +{ + const unsigned long overlap = BIT(24) | BIT(21); + unsigned long dit = !!(spsr & PSR_AA32_DIT_BIT); + + spsr &= ~overlap; + + spsr |= dit << 21; + + return spsr; +} + static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu) { u32 mode; diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index 773cf1439081..631d397ac81b 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -129,15 +129,15 @@ static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode) static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) { - unsigned long new_spsr_value = *vcpu_cpsr(vcpu); - bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT); + unsigned long spsr = *vcpu_cpsr(vcpu); + bool is_thumb = (spsr & PSR_AA32_T_BIT); u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode); /* Note: These now point to the banked copies */ - vcpu_write_spsr(vcpu, new_spsr_value); + vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr)); *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; /* Branch to exception vector */ -- cgit v1.2.3 From 1e9e2622a149e88bd636c9f8fb346a6e6aefeae0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 21 Nov 2019 11:17:11 +0800 Subject: KVM: VMX: FIXED+PHYSICAL mode single target IPI fastpath In our production observation, ICR and TSCDEADLINE MSR writes cause the bulk of MSR-write vmexits; multicast IPIs are not as common as unicast IPIs such as RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR. This patch introduces a mechanism to handle certain performance-critical WRMSRs in a very early stage of the KVM VMExit handler. The mechanism is specifically used for accelerating writes to the x2APIC ICR that attempt to send a virtual IPI with physical destination mode, fixed delivery mode and a single target, which was found to be one of the main causes of VMExits for Linux workloads. The mechanism significantly reduces the latency of such virtual IPIs by sending the physical IPI to the target vCPU in a very early stage of the KVM VMExit handler, before host interrupts are enabled and before expensive operations such as reacquiring KVM's SRCU lock. Latency is reduced even further when KVM is able to use the APICv posted-interrupt mechanism, which can deliver the virtual IPI directly to the target vCPU without kicking it out to the host. Testing on a Xeon Skylake server shows that the virtual IPI latency, from the sender's send to the receiver's receive, is reduced by more than 200 CPU cycles.
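To illustrate the predicate the fast path tests, the helper below is a minimal standalone sketch (the function name is made up for illustration); the in-tree code expresses the same test with the APIC_DEST_MASK/APIC_MODE_MASK constants against the 64-bit x2APIC ICR value written via WRMSR (EDX:EAX):

static bool icr_is_fixed_physical_sketch(u64 icr)
{
	/* Bit 11 is the destination mode: 0 = physical. */
	if (icr & (1ULL << 11))
		return false;
	/* Bits 10:8 are the delivery mode: 000b = fixed. */
	if (icr & (7ULL << 8))
		return false;
	return true;
}

For writes that satisfy this predicate, KVM can program APIC_ICR2/APIC_ICR immediately and skip the regular WRMSR emulation path, as the hunks below do.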
Reviewed-by: Liran Alon Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Sean Christopherson Cc: Vitaly Kuznetsov Cc: Liran Alon Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 ++++++++-- arch/x86/kvm/svm.c | 15 +++++++++---- arch/x86/kvm/vmx/vmx.c | 14 +++++++++--- arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 1 + 5 files changed, 78 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 159a28512e4c..e2b793477243 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -175,6 +175,11 @@ enum { VCPU_SREG_LDTR, }; +enum exit_fastpath_completion { + EXIT_FASTPATH_NONE, + EXIT_FASTPATH_SKIP_EMUL_INS, +}; + #include #define KVM_NR_MEM_OBJS 40 @@ -1095,7 +1100,8 @@ struct kvm_x86_ops { void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); void (*run)(struct kvm_vcpu *vcpu); - int (*handle_exit)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); @@ -1145,7 +1151,8 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath); bool (*mpx_supported)(void); bool (*xsaves_supported)(void); bool (*umip_emulated)(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8f1b715dfde8..9583ae7ae218 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4935,7 +4935,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -4993,7 +4994,10 @@ static int handle_exit(struct kvm_vcpu *vcpu) __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + kvm_skip_emulated_instruction(vcpu); + return 1; + } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -6186,9 +6190,12 @@ out: return ret; } -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath) { - + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == EXIT_REASON_MSR_WRITE) + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b5a0c2e05825..48a3af8fac0f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5814,7 +5814,8 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. 
*/ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; @@ -5900,7 +5901,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } - if (exit_reason < kvm_vmx_max_exit_handlers + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { + kvm_skip_emulated_instruction(vcpu); + return 1; + } else if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) { #ifdef CONFIG_RETPOLINE if (exit_reason == EXIT_REASON_MSR_WRITE) @@ -6248,7 +6252,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion *exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6256,6 +6261,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); + else if (!is_guest_mode(vcpu) && + vmx->exit_reason == EXIT_REASON_MSR_WRITE) + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static bool vmx_has_emulated_msr(int index) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 93bbbce67a03..33e165c0351d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1525,6 +1525,49 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +/* + * The fast path for frequent and performance sensitive wrmsr emulation, + * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces + * the latency of virtual IPI by avoiding the expensive bits of transitioning + * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the + * other cases which must be called after interrupts are enabled on the host. 
+ */ +static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +{ + if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) && + ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); + return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); + } + + return 1; +} + +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +{ + u32 msr = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); + int ret = 0; + + switch (msr) { + case APIC_BASE_MSR + (APIC_ICR >> 4): + ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + break; + default: + return EXIT_FASTPATH_NONE; + } + + if (!ret) { + trace_kvm_msr_write(msr, data); + return EXIT_FASTPATH_SKIP_EMUL_INS; + } + + return EXIT_FASTPATH_NONE; +} +EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); + /* * Adapt set_msr() to msr_io()'s calling convention */ @@ -7995,6 +8038,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); + enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; bool req_immediate_exit = false; @@ -8241,7 +8285,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops->handle_exit_irqoff(vcpu); + kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath); /* * Consume any pending interrupts, including the possible source of @@ -8285,7 +8329,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_lapic_sync_from_vapic(vcpu); vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu); + r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); return r; cancel_injection: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cab5e71f0f0f..9805cf2c6b35 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -291,6 +291,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, bool kvm_vector_hashing_enabled(void); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); +enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ -- cgit v1.2.3 From dfd146fcae8974d40ef6dcfc047d7a1631e064d9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 21 Nov 2019 11:17:12 +0800 Subject: KVM: LAPIC: micro-optimize fixed mode ipi delivery This patch optimizes away redundant logic run before a fixed mode IPI is delivered in the fast path: broadcast handling has to take the slow path anyway, so the delivery-mode fixup can be deferred until just before the slow path.
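The resulting order, sketched from the hunk below (both pieces already exist in kvm_irq_delivery_to_apic(); only their order changes):

	/* Try the fast path first. */
	if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
		return r;

	/*
	 * Only interrupts that fall through to the slow path need the
	 * lowest-priority physical-broadcast fixup.
	 */
	if (irq->dest_mode == APIC_DEST_PHYSICAL &&
	    irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq))
		irq->delivery_mode = APIC_DM_FIXED;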
Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9d711c2451c7..79afa0bb5f41 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -52,15 +52,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; + if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) + return r; + if (irq->dest_mode == APIC_DEST_PHYSICAL && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } - if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) - return r; - memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); kvm_for_each_vcpu(i, vcpu, kvm) { -- cgit v1.2.3 From cad23e72b7742578fad2e4ec8856d376ec8db923 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 7 Dec 2019 17:25:22 +0800 Subject: KVM: x86: check kvm_pit outside kvm_vm_ioctl_reinject() Check kvm_pit outside kvm_vm_ioctl_reinject() to keep the code style consistent with the other kvm_pit functions and to prepare for further cleanups. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33e165c0351d..e62d2688bfec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4703,9 +4703,6 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, { struct kvm_pit *pit = kvm->arch.vpit; - if (!pit) - return -ENXIO; - /* pit->pit_state.lock was overloaded to prevent userspace from getting * an inconsistent state after running multiple KVM_REINJECT_CONTROL * ioctls in parallel. Use a separate lock if that ioctl isn't rare. @@ -5072,6 +5069,9 @@ set_identity_unlock: r = -EFAULT; if (copy_from_user(&control, argp, sizeof(control))) goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; r = kvm_vm_ioctl_reinject(kvm, &control); break; } -- cgit v1.2.3 From 668effb63de8962e931196e4ebeae8387bfe6d3c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:20 +0800 Subject: KVM: Fix some wrong function names in comment Fix some wrong function names in comments: mmu_check_roots is a typo for mmu_check_root, vmcs_read_any should be vmcs12_read_any, and so on. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 2 +- virt/kvm/kvm_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index eb1ecd16fd22..5f3d95c18c39 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -28,7 +28,7 @@ BUILD_BUG_ON(1) /* * Keeping the fields ordered by size is an attempt at improving - * branch prediction in vmcs_read_any and vmcs_write_any. + * branch prediction in vmcs12_read_any and vmcs12_write_any.
*/ /* 16-bits */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3aa21bec028d..63df3586f062 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1117,7 +1117,7 @@ int __kvm_set_memory_region(struct kvm *kvm, * * validation of sp->gfn happens in: * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) - * - kvm_is_visible_gfn (mmu_check_roots) + * - kvm_is_visible_gfn (mmu_check_root) */ kvm_arch_flush_shadow_memslot(kvm, slot); -- cgit v1.2.3 From 4d516fe7d38555e8fa08de5d68168b011a144fcb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:21 +0800 Subject: KVM: Fix some out-dated function names in comment Since commit b1346ab2afbe ("KVM: nVMX: Rename prepare_vmcs02_*_full to prepare_vmcs02_*_rare"), prepare_vmcs02_full has been renamed to prepare_vmcs02_rare. nested_vmx_merge_msr_bitmap is renamed to nested_vmx_prepare_msr_bitmap since commit c992384bde84 ("KVM: vmx: speed up MSR bitmap merge"). Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs_shadow_fields.h | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h index 5f3d95c18c39..cad128d1657b 100644 --- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h +++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h @@ -23,7 +23,7 @@ BUILD_BUG_ON(1) * * When adding or removing fields here, note that shadowed * fields must always be synced by prepare_vmcs02, not just - * prepare_vmcs02_full. + * prepare_vmcs02_rare. */ /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 48a3af8fac0f..a2e61ae007dd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2008,7 +2008,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the + * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. We update the vmcs01 here for L1 as well * since it will end up touching the MSR anyway now. @@ -2044,7 +2044,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * * For nested: * The handling of the MSR bitmap for L2 guests is done in - * nested_vmx_merge_msr_bitmap. We should not touch the + * nested_vmx_prepare_msr_bitmap. We should not touch the * vmcs02.msr_bitmap here since it gets completely overwritten * in the merging. */ -- cgit v1.2.3 From 67b0ae43df179fb095f32a011446e7a883758877 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:22 +0800 Subject: KVM: Fix some comment typos and missing parentheses Fix some typos and add missing parentheses in the comments. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c7d4640b7b1c..a48d5708f1f8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1122,7 +1122,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; /* - * Clear apic_assist portion of f(struct hv_vp_assist_page + * Clear apic_assist portion of struct hv_vp_assist_page * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. 
*/ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 679692b55f6d..ea402e741bd5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -969,7 +969,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority * interrupt, handle it in posted mode and use the following mechanism - * to find the destinaiton vCPU. + * to find the destination vCPU. * 1. For lowest-priority interrupts, store all the possible * destination vCPUs in an array. * 2. Use "guest vector % max number of destination vCPUs" to find diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7b01ef1d87e6..63ab49de324d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3427,7 +3427,7 @@ vmentry_failed: /* * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date - * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). + * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). * This function returns the new value we should put in vmcs12.guest_cr0. * It's not enough to just return the vmcs02 GUEST_CR0. Rather, * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a2e61ae007dd..6f7774f54f1c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6720,7 +6720,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by * avoiding dealing with cases, such as enabling PML partially on vcpus - * for the guest, etc. + * for the guest), etc. */ if (enable_pml) { vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); -- cgit v1.2.3 From 00116795aa87ca309a4cf1eaa3d82614807c8668 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:23 +0800 Subject: KVM: Fix some grammar mistakes Fix some grammar mistakes in the comments. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/lapic.c | 2 +- virt/kvm/kvm_main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 77538fd77dc2..7312aab33298 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -189,7 +189,7 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, /* * Return 0 for coalesced interrupts; for edge-triggered interrupts, * this only happens if a previous edge has not been delivered due - * do masking. For level interrupts, the remote_irr field tells + * to masking. For level interrupts, the remote_irr field tells * us if the interrupt is waiting for an EOI. 
* * RTC is special: it is edge-triggered, but userspace likes to know diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ea402e741bd5..88c3c0c6d1e3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -964,7 +964,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, } /* - * This routine tries to handler interrupts in posted mode, here is how + * This routine tries to handle interrupts in posted mode, here is how * it deals with different cases: * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 63df3586f062..f0501272268f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -964,7 +964,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, /* * Increment the new memslot generation a second time, dropping the - * update in-progress flag and incrementing then generation based on + * update in-progress flag and incrementing the generation based on * the number of address spaces. This provides a unique and easily * identifiable generation number while the memslots are in flux. */ -- cgit v1.2.3 From 2f9f5cddb29b4fbdf2d328c7a6326d53227e6329 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:24 +0800 Subject: KVM: hyperv: Fix some typos in vcpu unimpl info Fix some typos in vcpu unimpl info. It should be unhandled rather than uhandled. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a48d5708f1f8..b255b9e865e5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1059,7 +1059,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } @@ -1179,7 +1179,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; break; default: - vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } -- cgit v1.2.3 From 311497e0c5565e7d9cf7b0987d17626b228b8fec Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 11 Dec 2019 14:26:25 +0800 Subject: KVM: Fix some writing mistakes Fix some writing mistakes in the comments. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- virt/kvm/kvm_main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e2b793477243..0b5c280644e5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -611,7 +611,7 @@ struct kvm_vcpu_arch { * Paging state of an L2 guest (used for nested npt) * * This context will save all necessary information to walk page tables - * of the an L2 guest. This context is only initialized for page table + * of an L2 guest. This context is only initialized for page table * walking and not for faulting since we never handle l2 page faults on * the host. 
*/ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6f7774f54f1c..c2ced79aee3e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1914,7 +1914,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } /* - * Writes msr value into into the appropriate "register". + * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f0501272268f..1a6d5ebd5c42 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1519,7 +1519,7 @@ static inline int check_user_page_hwpoison(unsigned long addr) /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. It's also the - * only part that runs if we can are in atomic context. + * only part that runs if we can in atomic context. */ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) -- cgit v1.2.3 From fe6ed369fca98e99df55c932b85782a5687526b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 15:24:32 -0800 Subject: KVM: VMX: Add non-canonical check on writes to RTIT address MSRs Reject writes to RTIT address MSRs if the data being written is a non-canonical address as the MSRs are subject to canonical checks, e.g. KVM will trigger an unchecked #GP when loading the values to hardware during pt_guest_enter(). Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c2ced79aee3e..aea4fa957fd2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2144,6 +2144,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges))) return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; else -- cgit v1.2.3 From e348ac7c9e34bb2109facaafd3eea65f47314a9d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 15:24:33 -0800 Subject: KVM: VMX: Add helper to consolidate up PT/RTIT WRMSR fault logic Add a helper to consolidate the common checks for writing PT MSRs, and opportunistically clean up the formatting of the affected code. No functional change intended. 
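For reference, the non-canonical check added above reduces to a sign-extension test. A minimal sketch assuming 48-bit virtual addresses (the function name is made up for illustration; the in-tree is_noncanonical_address() derives the width from vcpu_virt_addr_bits(), which also accounts for LA57):

static inline bool is_canonical_va48_sketch(u64 addr)
{
	/*
	 * Bits 63:48 must all be copies of bit 47; an arithmetic right
	 * shift on s64 is assumed, as kernel code relies on elsewhere.
	 */
	return ((s64)(addr << 16) >> 16) == (s64)addr;
}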
Cc: Chao Peng Cc: Luwei Kang Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 55 ++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aea4fa957fd2..857dd0898e5f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1057,6 +1057,12 @@ static unsigned long segment_base(u16 selector) } #endif +static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) +{ + return (pt_mode == PT_MODE_HOST_GUEST) && + !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); +} + static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; @@ -2102,47 +2108,48 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pt_update_intercept_for_msr(vmx); break; case MSR_IA32_RTIT_STATUS: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (data & MSR_IA32_RTIT_STATUS_MASK)) + if (!pt_can_write_msr(vmx)) + return 1; + if (data & MSR_IA32_RTIT_STATUS_MASK) return 1; vmx->pt_desc.guest.status = data; break; case MSR_IA32_RTIT_CR3_MATCH: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_cr3_filtering)) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_cr3_filtering)) return 1; vmx->pt_desc.guest.cr3_match = data; break; case MSR_IA32_RTIT_OUTPUT_BASE: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (!intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_topa_output) && - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_single_range_output)) || - (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) + return 1; + if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK) return 1; vmx->pt_desc.guest.output_base = data; break; case MSR_IA32_RTIT_OUTPUT_MASK: - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (!intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_topa_output) && - !intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_single_range_output))) + if (!pt_can_write_msr(vmx)) + return 1; + if (!intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_topa_output) && + !intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_single_range_output)) return 1; vmx->pt_desc.guest.output_mask = data; break; case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: + if (!pt_can_write_msr(vmx)) + return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if ((pt_mode != PT_MODE_HOST_GUEST) || - (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || - (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges))) + if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, + PT_CAP_num_address_ranges)) return 1; if (is_noncanonical_address(data, vcpu)) return 1; -- cgit v1.2.3 From b11306b53b2540c6ba068c4deddb6a17d9f8d95b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:13 -0800 Subject: KVM: x86: Don't let userspace set host-reserved cr4 bits Calculate the host-reserved cr4 bits at runtime based on the system's capabilities (using logic similar to __do_cpuid_func()), and use the dynamically generated mask for the reserved bit check in kvm_set_cr4() instead of using the static CR4_RESERVED_BITS define. This prevents userspace from "enabling" features in cr4 that are not supported by the system, e.g. by ignoring KVM_GET_SUPPORTED_CPUID and specifying a bogus CPUID for the vCPU. Allowing userspace to set unsupported bits in cr4 can lead to a variety of undesirable behavior, e.g. failed VM-Enter, and in general increases KVM's attack surface. A crafty userspace can even abuse CR4.LA57 to induce an unchecked #GP on a WRMSR. On a platform without LA57 support: KVM_SET_CPUID2 // CPUID_7_0_ECX.LA57 = 1 KVM_SET_SREGS // CR4.LA57 = 1 KVM_SET_MSRS // KERNEL_GS_BASE = 0x0004000000000000 KVM_RUN leads to a #GP when writing KERNEL_GS_BASE into hardware: unchecked MSR access error: WRMSR to 0xc0000102 (tried to write 0x0004000000000000) at rIP: 0xffffffffa00f239a (vmx_prepare_switch_to_guest+0x10a/0x1d0 [kvm_intel]) Call Trace: kvm_arch_vcpu_ioctl_run+0x671/0x1c70 [kvm] kvm_vcpu_ioctl+0x36b/0x5d0 [kvm] do_vfs_ioctl+0xa1/0x620 ksys_ioctl+0x66/0x70 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x4c/0x170 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fc08133bf47 Note, the above sequence fails VM-Enter due to invalid guest state. Userspace can allow VM-Enter to succeed (after the WRMSR #GP) by adding a KVM_SET_SREGS w/ CR4.LA57=0 after KVM_SET_MSRS, in which case KVM will technically leak the host's KERNEL_GS_BASE into the guest. But, as KERNEL_GS_BASE is a userspace-defined value/address, the leak is largely benign as a malicious userspace would simply be exposing its own data to the guest, and attacking a benevolent userspace would require multiple bugs in the userspace VMM. Cc: stable@vger.kernel.org Cc: Jun Nakajima Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e62d2688bfec..8a907cd7b1e1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -93,6 +93,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif +static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ #define VCPU_STAT(x, ...)
offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ @@ -879,9 +881,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); +static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) +{ + u64 reserved_bits = CR4_RESERVED_BITS; + + if (!cpu_has(c, X86_FEATURE_XSAVE)) + reserved_bits |= X86_CR4_OSXSAVE; + + if (!cpu_has(c, X86_FEATURE_SMEP)) + reserved_bits |= X86_CR4_SMEP; + + if (!cpu_has(c, X86_FEATURE_SMAP)) + reserved_bits |= X86_CR4_SMAP; + + if (!cpu_has(c, X86_FEATURE_FSGSBASE)) + reserved_bits |= X86_CR4_FSGSBASE; + + if (!cpu_has(c, X86_FEATURE_PKU)) + reserved_bits |= X86_CR4_PKE; + + if (!cpu_has(c, X86_FEATURE_LA57) && + !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) + reserved_bits |= X86_CR4_LA57; + + if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) + reserved_bits |= X86_CR4_UMIP; + + return reserved_bits; +} + static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - if (cr4 & CR4_RESERVED_BITS) + if (cr4 & cr4_reserved_bits) return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) @@ -9400,6 +9431,8 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); + if (kvm_has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that -- cgit v1.2.3 From f1cdecf5807b1a91829a2dc4f254bfe6bafd4776 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:14 -0800 Subject: KVM: x86: Ensure all logical CPUs have consistent reserved cr4 bits Check the current CPU's reserved cr4 bits against the mask calculated for the boot CPU to ensure consistent behavior across all CPUs. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8a907cd7b1e1..960b886e1e43 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9461,6 +9461,13 @@ void kvm_arch_hardware_unsetup(void) int kvm_arch_check_processor_compat(void) { + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + WARN_ON(!irqs_disabled()); + + if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) + return -EIO; + return kvm_x86_ops->check_processor_compatibility(); } -- cgit v1.2.3 From 96be4e069c938e4a5fc7125de7e1cc7089b1adef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:15 -0800 Subject: KVM: x86: Drop special XSAVE handling from guest_cpuid_has() Now that KVM prevents setting host-reserved CR4 bits, drop the dedicated XSAVE check in guest_cpuid_has() in favor of open coding similar checks in the SVM/VMX XSAVES enabling flows. Note, checking boot_cpu_has(X86_FEATURE_XSAVE) in the XSAVES flows is technically redundant with respect to the CR4 reserved bit checks, e.g. XSAVES #UDs if CR4.OSXSAVE=0 and arch.xsaves_enabled is consumed if and only if CR4.OSXSAVE=1 in the guest. Keep (add?) the explicit boot_cpu_has() checks to help document KVM's usage of arch.xsaves_enabled. No functional change intended.
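The resulting enable logic, sketched from the SVM hunk below (the VMX flow gains the equivalent boot_cpu_has() term):

	/*
	 * boot_cpu_has(X86_FEATURE_XSAVE) is redundant with the CR4
	 * reserved-bit checks but documents the dependency explicitly.
	 */
	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVES);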
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 4 ---- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/vmx/vmx.c | 1 + 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d78a61408243..b82ae4d3dc71 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -93,10 +93,6 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ { int *reg; - if (x86_feature == X86_FEATURE_XSAVE && - !static_cpu_has(X86_FEATURE_XSAVE)) - return false; - reg = guest_cpuid_get_register(vcpu, x86_feature); if (!reg) return false; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9583ae7ae218..d399eb7bbff3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5917,6 +5917,7 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVES); /* Update nrips enabled cache */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 857dd0898e5f..3e732d092c40 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4049,6 +4049,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (vmx_xsaves_supported()) { /* Exposing XSAVES only when XSAVE is exposed */ bool xsaves_enabled = + boot_cpu_has(X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); -- cgit v1.2.3 From 345599f9a292899bf5474651f3cea9b7a0576436 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2019 14:44:16 -0800 Subject: KVM: x86: Add macro to ensure reserved cr4 bits checks stay in sync Add a helper macro to generate the set of reserved cr4 bits for both host and guest to ensure that adding a check on guest capabilities is also added for host capabilities, and vice versa. 
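Usage, sketched from the hunk below: the same macro body is instantiated with two different "has" predicates, so a feature added to one check cannot silently be dropped from the other:

	/* Host: __c is a struct cpuinfo_x86 *, queried via cpu_has(). */
	u64 host_rsvd = __cr4_reserved_bits(cpu_has, c);

	/* Guest: __c is a struct kvm_vcpu *, queried via guest_cpuid_has(). */
	u64 guest_rsvd = __cr4_reserved_bits(guest_cpuid_has, vcpu);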
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 65 +++++++++++++++++++++--------------------------------- 1 file changed, 25 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 960b886e1e43..456fc131c95e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -881,31 +881,34 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); +#define __cr4_reserved_bits(__cpu_has, __c) \ +({ \ + u64 __reserved_bits = CR4_RESERVED_BITS; \ + \ + if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ + __reserved_bits |= X86_CR4_OSXSAVE; \ + if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ + __reserved_bits |= X86_CR4_SMEP; \ + if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ + __reserved_bits |= X86_CR4_SMAP; \ + if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ + __reserved_bits |= X86_CR4_FSGSBASE; \ + if (!__cpu_has(__c, X86_FEATURE_PKU)) \ + __reserved_bits |= X86_CR4_PKE; \ + if (!__cpu_has(__c, X86_FEATURE_LA57)) \ + __reserved_bits |= X86_CR4_LA57; \ + __reserved_bits; \ +}) + static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) { - u64 reserved_bits = CR4_RESERVED_BITS; - - if (!cpu_has(c, X86_FEATURE_XSAVE)) - reserved_bits |= X86_CR4_OSXSAVE; - - if (!cpu_has(c, X86_FEATURE_SMEP)) - reserved_bits |= X86_CR4_SMEP; - - if (!cpu_has(c, X86_FEATURE_SMAP)) - reserved_bits |= X86_CR4_SMAP; - - if (!cpu_has(c, X86_FEATURE_FSGSBASE)) - reserved_bits |= X86_CR4_FSGSBASE; + u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (!cpu_has(c, X86_FEATURE_PKU)) - reserved_bits |= X86_CR4_PKE; + if (cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)) + reserved_bits &= ~X86_CR4_LA57; - if (!cpu_has(c, X86_FEATURE_LA57) && - !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) - reserved_bits |= X86_CR4_LA57; - - if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) - reserved_bits |= X86_CR4_UMIP; + if (kvm_x86_ops->umip_emulated()) + reserved_bits &= ~X86_CR4_UMIP; return reserved_bits; } @@ -915,25 +918,7 @@ static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & cr4_reserved_bits) return -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu)) return -EINVAL; return 0; -- cgit v1.2.3 From 5ae78e95ed0c771935d0d24291d221312524830c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:38 -0800 Subject: KVM: x86: Add dedicated emulator helpers for querying CPUID features Add feature-specific helpers for querying guest CPUID support from the emulator instead of having the emulator do a full CPUID and perform its own bit tests. The primary motivation is to eliminate the emulator's usage of bit() so that future patches can add more extensive build-time assertions on the usage of bit() without having to expose yet more code to the emulator. 
Note, providing a generic guest_cpuid_has() to the emulator doesn't work due to the existing build-time assertions in guest_cpuid_has(), which require the feature being checked to be a compile-time constant. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 4 ++++ arch/x86/kvm/emulate.c | 21 +++------------------ arch/x86/kvm/x86.c | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 77cf6c11f66b..03946eb3e2b9 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -222,6 +222,10 @@ struct x86_emulate_ops { bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit); + bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); + bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); + bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); + void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 952d1a4f4d7e..e9833e345a5c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2348,12 +2348,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) { #ifdef CONFIG_X86_64 - u32 eax, ebx, ecx, edx; - - eax = 0x80000001; - ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - return edx & bit(X86_FEATURE_LM); + return ctxt->ops->guest_has_long_mode(ctxt); #else return false; #endif @@ -3618,18 +3613,11 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -#define FFL(x) bit(X86_FEATURE_##x) - static int em_movbe(struct x86_emulate_ctxt *ctxt) { - u32 ebx, ecx, edx, eax = 1; u16 tmp; - /* - * Check MOVBE is set in the guest-visible CPUID leaf.
- */ - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - if (!(ecx & FFL(MOVBE))) + if (!ctxt->ops->guest_has_movbe(ctxt)) return emulate_ud(ctxt); switch (ctxt->op_bytes) { @@ -4027,10 +4015,7 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) static int check_fxsr(struct x86_emulate_ctxt *ctxt) { - u32 eax = 1, ebx, ecx = 0, edx; - - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - if (!(edx & FFL(FXSR))) + if (!ctxt->ops->guest_has_fxsr(ctxt)) return emulate_ud(ctxt); if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 456fc131c95e..60b0d69af0f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6245,6 +6245,21 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); } +static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); +} + +static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); +} + +static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { return kvm_register_read(emul_to_vcpu(ctxt), reg); @@ -6322,6 +6337,9 @@ static const struct x86_emulate_ops emulate_ops = { .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .guest_has_long_mode = emulator_guest_has_long_mode, + .guest_has_movbe = emulator_guest_has_movbe, + .guest_has_fxsr = emulator_guest_has_fxsr, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, -- cgit v1.2.3 From a0a2260c12d8658e522f21ed8ece72bbdede58fd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:39 -0800 Subject: KVM: x86: Move bit() helper to cpuid.h Move bit() to cpuid.h in preparation for incorporating the reverse_cpuid array in bit() build-time assertions. Opportunistically use the BIT() macro instead of open-coding the shift. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 5 +++++ arch/x86/kvm/x86.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b82ae4d3dc71..f908f76216bf 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -55,6 +55,11 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, }; +static inline u32 bit(int bitno) +{ + return BIT(bitno & 31); +} + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 9805cf2c6b35..ab715cee3653 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -144,11 +144,6 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu) return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu); } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 
57 : 48; -- cgit v1.2.3 From daa0d8c3a48732e5f64c69cca4c597cab1dfd455 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:40 -0800 Subject: KVM: x86: Add CPUID_7_1_EAX to the reverse CPUID table Add an entry for CPUID_7_1_EAX in the reverse_cpuid array in preparation for incorporating the array in bit() build-time assertions, specifically to avoid an assertion on F(AVX512_BF16) in do_cpuid_7_mask(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f908f76216bf..5fc424d2e1fa 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -53,6 +53,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; static inline u32 bit(int bitno) -- cgit v1.2.3 From a7c48c3f56db8d18d15ee6d8c2c5e2da447c77cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:41 -0800 Subject: KVM: x86: Expand build-time assertion on reverse CPUID usage Add build-time checks to ensure KVM isn't trying to do a reverse CPUID lookup on Linux-defined feature bits, along with comments to explain the gory details of X86_FEATUREs and bit(). No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/cpuid.h | 33 ++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cf55629ff0ff..ca20951fac2b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -281,8 +281,9 @@ out: return r; } -static void cpuid_mask(u32 *word, int wordnum) +static __always_inline void cpuid_mask(u32 *word, int wordnum) { + reverse_cpuid_check(wordnum); *word &= boot_cpu_data.x86_capability[wordnum]; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5fc424d2e1fa..fb3c44b019ec 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -56,18 +56,41 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX}, }; -static inline u32 bit(int bitno) +/* + * Reverse CPUID and its derivatives can only be used for hardware-defined + * feature words, i.e. words whose bits directly correspond to a CPUID leaf. + * Retrieving a feature bit or masking guest CPUID from a Linux-defined word + * is nonsensical as the bit number/mask is an arbitrary software-defined value + * and can't be used by KVM to query/control guest capabilities. And obviously + * the leaf being queried must have an entry in the lookup table. + */ +static __always_inline void reverse_cpuid_check(unsigned x86_leaf) { - return BIT(bitno & 31); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); + BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); +} + +/* + * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain + * the hardware defined bit number (stored in bits 4:0) and a software defined + * "word" (stored in bits 31:5). The word is used to index into arrays of + * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */ +static __always_inline u32 bit(int x86_feature) +{ + reverse_cpuid_check(x86_feature / 32); + return 1 << (x86_feature & 31); } static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; - BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); - BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); - + reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } -- cgit v1.2.3 From 87382003e3555926017228452dae7e7064b0f915 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 17 Dec 2019 13:32:42 -0800 Subject: KVM: x86: Refactor and rename bit() to feature_bit() macro Rename bit() to __feature_bit() to give it a more descriptive name, and add a macro, feature_bit(), to stuff the X86_FEATURE_ prefix to keep line lengths manageable for code that hardcodes the bit to be retrieved. No functional change intended. Cc: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 +++++--- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 42 +++++++++++++++++++++--------------------- arch/x86/kvm/x86.c | 2 +- 5 files changed, 30 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ca20951fac2b..74a4d9b4e61f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -62,7 +62,7 @@ u64 kvm_supported_xcr0(void) return xcr0; } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit int kvm_update_cpuid(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index fb3c44b019ec..7366c618aa04 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -80,12 +80,14 @@ static __always_inline void reverse_cpuid_check(unsigned x86_leaf) * "word" (stored in bits 31:5). The word is used to index into arrays of * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
*/ -static __always_inline u32 bit(int x86_feature) +static __always_inline u32 __feature_bit(int x86_feature) { reverse_cpuid_check(x86_feature / 32); return 1 << (x86_feature & 31); } +#define feature_bit(name) __feature_bit(X86_FEATURE_##name) + static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; @@ -126,7 +128,7 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ if (!reg) return false; - return *reg & bit(x86_feature); + return *reg & __feature_bit(x86_feature); } static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) @@ -135,7 +137,7 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x8 reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) - *reg &= ~bit(x86_feature); + *reg &= ~__feature_bit(x86_feature); } static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d399eb7bbff3..31d6a98f5eda 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5929,14 +5929,14 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } -#define F(x) bit(X86_FEATURE_##x) +#define F feature_bit static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { case 0x1: if (avic) - entry->ecx &= ~bit(X86_FEATURE_X2APIC); + entry->ecx &= ~F(X2APIC); break; case 0x80000001: if (nested) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3e732d092c40..bdbf27e92851 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6989,28 +6989,28 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) } while (0) entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); - cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); - cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); - cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); - cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); - cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); - cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); - cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); - cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); - cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); - cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); - cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); - cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); - cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); - 
cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); - cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); - cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); - cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); - cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); - cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57)); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); + cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); + cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); #undef cr4_fixed1_update } @@ -7144,7 +7144,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); + entry->ecx |= feature_bit(VMX); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60b0d69af0f1..3e70af42f65b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -904,7 +904,7 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) { u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)) + if (cpuid_ecx(0x7) & feature_bit(LA57)) reserved_bits &= ~X86_CR4_LA57; if (kvm_x86_ops->umip_emulated()) -- cgit v1.2.3 From d8010a779a2f3f92fb10d617b8aa1ddd772e987c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 14 Dec 2019 14:48:45 +0800 Subject: KVM: vmx: delete meaningless nested_vmx_prepare_msr_bitmap() declaration nested_vmx_prepare_msr_bitmap() is declared below its own implementation, so the forward declaration is meaningless and should be removed. Signed-off-by: Miaohe Lin Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 63ab49de324d..e038a331583c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3048,9 +3048,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); - static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); -- cgit v1.2.3 From 52918ed5fcf05d97d257f4131e19479da18f5d16 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 9 Jan 2020 17:42:16 -0600 Subject: KVM: SVM: Override default MMIO mask if memory encryption is enabled The KVM MMIO support uses bit 51 as the reserved bit to cause nested page faults when a guest performs MMIO. The AMD memory encryption support uses a CPUID function to define the encryption bit position. Given this, it is possible that these bits can conflict. Use svm_hardware_setup() to override the MMIO mask if memory encryption support is enabled. Various checks are performed to ensure that the mask is properly defined and rsvd_bits() is used to generate the new mask (as was done prior to the change that necessitated this patch).
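A worked example of the adjustment, using hypothetical numbers chosen only for illustration: on a part reporting x86_phys_bits == 43 with the encryption bit at position 47 (enc_bit != mask_bit, so no bump), the hunk below computes

	mask = rsvd_bits(43, 51) | PT_PRESENT_MASK;	/* bits 51:43 plus P */

and passes it as both the mask and the value to kvm_mmu_set_mmio_spte_mask(), so an SPTE is treated as MMIO only when all of those bits are set; a legitimate encrypted mapping, which sets just the C-bit, can never match. Had enc_bit and mask_bit both been 47, mask_bit would be bumped to 48 so the generated mask steers clear of the encryption bit entirely.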
Fixes: 28a1f3ac1d0c ("kvm: x86: Set highest physical address bits in non-present/reserved SPTEs") Suggested-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Tom Lendacky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31d6a98f5eda..b7c5369c7998 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1307,6 +1307,47 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1361,6 +1402,8 @@ static __init int svm_hardware_setup(void) } } + svm_adjust_mmio_mask(); + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) -- cgit v1.2.3 From f8052a053a7af2a53288901d27d6419e100ad8e2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jan 2020 15:06:39 -0800 Subject: KVM: x86/mmu: Reorder the reserved bit check in prefetch_invalid_gpte() Move the !PRESENT and !ACCESSED checks in FNAME(prefetch_invalid_gpte) above the call to is_rsvd_bits_set(). For a well behaved guest, the !PRESENT and !ACCESSED are far more likely to evaluate true than the reserved bit checks, and they do not require additional memory accesses. 
Before: Dump of assembler code for function paging32_prefetch_invalid_gpte: 0x0000000000044240 <+0>: callq 0x44245 0x0000000000044245 <+5>: mov %rcx,%rax 0x0000000000044248 <+8>: shr $0x7,%rax 0x000000000004424c <+12>: and $0x1,%eax 0x000000000004424f <+15>: lea 0x0(,%rax,4),%r8 0x0000000000044257 <+23>: add %r8,%rax 0x000000000004425a <+26>: mov %rcx,%r8 0x000000000004425d <+29>: and 0x120(%rsi,%rax,8),%r8 0x0000000000044265 <+37>: mov 0x170(%rsi),%rax 0x000000000004426c <+44>: shr %cl,%rax 0x000000000004426f <+47>: and $0x1,%eax 0x0000000000044272 <+50>: or %rax,%r8 0x0000000000044275 <+53>: jne 0x4427c 0x0000000000044277 <+55>: test $0x1,%cl 0x000000000004427a <+58>: jne 0x4428a 0x000000000004427c <+60>: mov %rdx,%rsi 0x000000000004427f <+63>: callq 0x44080 0x0000000000044284 <+68>: mov $0x1,%eax 0x0000000000044289 <+73>: retq 0x000000000004428a <+74>: xor %eax,%eax 0x000000000004428c <+76>: and $0x20,%ecx 0x000000000004428f <+79>: jne 0x44289 0x0000000000044291 <+81>: mov %rdx,%rsi 0x0000000000044294 <+84>: callq 0x44080 0x0000000000044299 <+89>: mov $0x1,%eax 0x000000000004429e <+94>: jmp 0x44289 End of assembler dump. After: Dump of assembler code for function paging32_prefetch_invalid_gpte: 0x0000000000044240 <+0>: callq 0x44245 0x0000000000044245 <+5>: test $0x1,%cl 0x0000000000044248 <+8>: je 0x4424f 0x000000000004424a <+10>: test $0x20,%cl 0x000000000004424d <+13>: jne 0x4425d 0x000000000004424f <+15>: mov %rdx,%rsi 0x0000000000044252 <+18>: callq 0x44080 0x0000000000044257 <+23>: mov $0x1,%eax 0x000000000004425c <+28>: retq 0x000000000004425d <+29>: mov %rcx,%rax 0x0000000000044260 <+32>: mov (%rsi),%rsi 0x0000000000044263 <+35>: shr $0x7,%rax 0x0000000000044267 <+39>: and $0x1,%eax 0x000000000004426a <+42>: lea 0x0(,%rax,4),%r8 0x0000000000044272 <+50>: add %r8,%rax 0x0000000000044275 <+53>: mov %rcx,%r8 0x0000000000044278 <+56>: and 0x120(%rsi,%rax,8),%r8 0x0000000000044280 <+64>: mov 0x170(%rsi),%rax 0x0000000000044287 <+71>: shr %cl,%rax 0x000000000004428a <+74>: and $0x1,%eax 0x000000000004428d <+77>: mov %rax,%rcx 0x0000000000044290 <+80>: xor %eax,%eax 0x0000000000044292 <+82>: or %rcx,%r8 0x0000000000044295 <+85>: je 0x4425c 0x0000000000044297 <+87>: mov %rdx,%rsi 0x000000000004429a <+90>: callq 0x44080 0x000000000004429f <+95>: mov $0x1,%eax 0x00000000000442a4 <+100>: jmp 0x4425c End of assembler dump. 
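In C terms, the reordering simply puts the cheap single-bit tests ahead of the table-driven reserved-bit lookup; a minimal standalone sketch of the resulting shape (stand-in masks, not the kernel code):

#include <stdbool.h>
#include <stdint.h>

#define PRESENT_MASK	(1ULL << 0)	/* stand-ins for the real masks */
#define ACCESSED_MASK	(1ULL << 5)
#define RSVD_MASK	(0xfULL << 48)	/* placeholder reserved-bit mask */

static bool prefetch_invalid_gpte_sketch(uint64_t gpte)
{
	if (!(gpte & PRESENT_MASK))	/* cheap bit test, filters most cases */
		return true;
	if (!(gpte & ACCESSED_MASK))	/* cheap bit test, no extra loads */
		return true;
	if (gpte & RSVD_MASK)		/* the expensive check, now last */
		return true;
	return false;
}

int main(void)
{
	return prefetch_invalid_gpte_sketch(PRESENT_MASK | ACCESSED_MASK) ? 1 : 0;
}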
Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index b53bed3c901c..1fde6a1c506d 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -175,9 +175,6 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - if (!FNAME(is_present_gpte)(gpte)) goto no_present; @@ -186,6 +183,9 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; + if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + return false; no_present: -- cgit v1.2.3 From b5c3c1b3c6e95cc67910e27a1e7603d838c2ebed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Jan 2020 15:06:40 -0800 Subject: KVM: x86/mmu: Micro-optimize nEPT's bad memtype/XWR checks Rework the handling of nEPT's bad memtype/XWR checks to micro-optimize the checks as much as possible. Move the check to a separate helper, __is_bad_mt_xwr(), which allows the guest_rsvd_check usage in paging_tmpl.h to omit the check entirely for paging32/64 (bad_mt_xwr is always zero for non-nEPT) while retaining the bitwise-OR of the current code for the shadow_zero_check in walk_shadow_page_get_mmio_spte(). Add a comment for the bitwise-OR usage in the mmio spte walk to avoid future attempts to "fix" the code, which is what prompted this optimization in the first place[*]. Opportunistically remove the superfluous '!= 0' and parentheses, and use BIT_ULL() instead of open coding its equivalent. The net effect is that code generation is largely unchanged for walk_shadow_page_get_mmio_spte(), marginally better for ept_prefetch_invalid_gpte(), and significantly improved for paging32/64_prefetch_invalid_gpte(). Note, walk_shadow_page_get_mmio_spte() can't use a templated version of the memtype/XWR check as it works on the host's shadow PTEs, e.g. checks that KVM hasn't borked its EPT tables. Even if it could be templated, the benefits of having a single implementation far outweigh the few uops that would be saved for NPT or non-TDP paging, e.g. most compilers inline it all the way up to kvm_mmu_page_fault().
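As an aside, the bitwise-OR trick that the new comment in the mmu.c hunk below defends can be illustrated in isolation (a sketch, not the kernel code):

#include <stdbool.h>
#include <stdint.h>

static bool check_a(uint64_t x) { return x & 1; }
static bool check_b(uint64_t x) { return x & 2; }

static bool walk_sketch(const uint64_t *sptes, int n)
{
	bool reserved = false;
	int i;

	for (i = 0; i < n; i++)
		/*
		 * '|' evaluates both checks unconditionally, so no extra
		 * conditional branch (Jcc) is emitted inside the loop;
		 * '||' would short-circuit and branch on every iteration.
		 */
		reserved |= check_a(sptes[i]) | check_b(sptes[i]);

	return reserved;
}

int main(void)
{
	uint64_t sptes[] = { 0, 1, 2 };
	return walk_sketch(sptes, 3) ? 0 : 1;
}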
[*] https://lkml.kernel.org/r/20200108001859.25254-1-sean.j.christopherson@intel.com Cc: Jim Mattson Cc: David Laight Cc: Arvind Sankar Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 26 ++++++++++++++------------ arch/x86/kvm/mmu/paging_tmpl.h | 19 +++++++++++++++++-- 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7269130ea5e2..2992ff7b42a7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3968,20 +3968,14 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, static bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { - int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; + int bit7 = (pte >> 7) & 1; - return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | - ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); + return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) { - return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); -} - -static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) -{ - return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); + return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -4005,9 +3999,12 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; + struct rsvd_bits_validate *rsvd_check; int root, leaf; bool reserved = false; + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; + walk_shadow_page_lockless_begin(vcpu); for (shadow_walk_init(&iterator, vcpu, addr), @@ -4022,8 +4019,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (!is_shadow_present_pte(spte)) break; - reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte, - iterator.level); + /* + * Use a bitwise-OR instead of a logical-OR to aggregate the + * reserved bit and EPT's invalid memtype/XWR checks to avoid + * adding a Jcc in the loop. 
+ */ + reserved |= __is_bad_mt_xwr(rsvd_check, spte) | + __is_rsvd_bits_set(rsvd_check, spte, iterator.level); } walk_shadow_page_lockless_end(vcpu); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1fde6a1c506d..eaa00c4daeb1 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -128,6 +128,21 @@ static inline int FNAME(is_present_gpte)(unsigned long pte) #endif } +static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte) +{ +#if PTTYPE != PTTYPE_EPT + return false; +#else + return __is_bad_mt_xwr(rsvd_check, gpte); +#endif +} + +static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) || + FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); +} + static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) @@ -183,7 +198,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; - if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; return false; @@ -400,7 +415,7 @@ retry_walk: if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; - if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { + if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } -- cgit v1.2.3 From a4d956b9390418623ae5d07933e2679c68b6f83c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 28 Dec 2019 14:25:24 +0800 Subject: KVM: nVMX: vmread should not set rflags to specify success in case of #PF If writing to the vmread destination operand results in a #PF, vmread should not call nested_vmx_succeed() to set rflags to indicate success, similar to what is done in VMPTRST (see handle_vmptrst()). Reviewed-by: Liran Alon Signed-off-by: Miaohe Lin Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e038a331583c..ef2d53854d15 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4799,8 +4799,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu) instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) + if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) { kvm_inject_page_fault(vcpu, &e); + return 1; + } } return nested_vmx_succeed(vcpu); -- cgit v1.2.3 From e30a7d623dccdb3f880fbcad980b0cb589a1da45 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Jan 2020 16:12:10 -0800 Subject: KVM: x86/mmu: Apply max PA check for MMIO sptes to 32-bit KVM Remove the bogus 64-bit only condition from the check that disables MMIO spte optimization when the system supports the max PA, i.e. doesn't have any reserved PA bits. 32-bit KVM always uses PAE paging for the shadow MMU, and per Intel's SDM: PAE paging translates 32-bit linear addresses to 52-bit physical addresses. The kernel's restrictions on max physical addresses are limits on how much memory the kernel can reasonably use, not what physical addresses are supported by hardware.
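To put a number on it (hypothetical MAXPHYADDR, just for illustration): a PAE PTE is 64 bits wide even on a 32-bit kernel, so bit 51 is always representable, and whether it is reserved depends only on the CPU's physical address width:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int maxphyaddr = 40;	/* assumed host MAXPHYADDR */
	uint64_t pae_pte = 1ULL << 51;	/* PAE PTEs are 64-bit everywhere */

	printf("bit 51 set: %d, reserved on this host: %d\n",
	       !!(pae_pte & (1ULL << 51)), 51 >= maxphyaddr);
	return 0;
}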
Fixes: ce88decffd17 ("KVM: MMU: mmio page fault support") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2992ff7b42a7..57e4dbddba72 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6193,7 +6193,7 @@ static void kvm_set_mmio_spte_mask(void) * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ - if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) + if (shadow_phys_bits == 52) mask &= ~1ull; kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); -- cgit v1.2.3 From 56871d444bc4d7ea66708775e62e2e0926384dbc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 18 Jan 2020 20:09:03 +0100 Subject: KVM: x86: fix overlap between SPTE_MMIO_MASK and generation The SPTE_MMIO_MASK overlaps with the bits used to track MMIO generation number. A high enough generation number would overwrite the SPTE_SPECIAL_MASK region and cause the MMIO SPTE to be misinterpreted. Likewise, setting bits 52 and 53 would also cause an incorrect generation number to be read from the PTE, though this was partially mitigated by the (useless if it weren't for the bug) removal of SPTE_SPECIAL_MASK from the spte in get_mmio_spte_generation. Drop that removal, and replace it with a compile-time assertion. Fixes: 6eeb4ef049e7 ("KVM: x86: assign two bits to track SPTE kinds") Reported-by: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 57e4dbddba72..b9052c7ba43d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -418,22 +418,24 @@ static inline bool is_access_track_spte(u64 spte) * requires a full MMU zap). The flag is instead explicitly queried when * checking for MMIO spte cache hits. */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0) #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 11 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) -#define MMIO_SPTE_GEN_HIGH_START 52 -#define MMIO_SPTE_GEN_HIGH_END 61 +#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT +#define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) + static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; @@ -444,8 +446,6 @@ static u64 get_mmio_spte_generation(u64 spte) { u64 gen; - spte &= ~shadow_mmio_mask; - gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; return gen; -- cgit v1.2.3 From 99634e3ec0d4e0df28ae465b10f3613a4ceee58b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jan 2020 14:22:55 +0100 Subject: KVM: x86: list MSR_IA32_UCODE_REV as an emulated MSR Even if it's read-only, it can still be written to by userspace. Let them know by adding it to KVM_GET_MSR_INDEX_LIST. 
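From the userspace side the change is visible through the system ioctl; a minimal sketch of how a VMM would enumerate the list (error handling elided, fixed buffer size assumed large enough):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDONLY);
	struct kvm_msr_list *list;
	unsigned int i;

	if (kvm < 0)
		return 1;

	list = calloc(1, sizeof(*list) + 1024 * sizeof(__u32));
	list->nmsrs = 1024;
	if (ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list) == 0)
		for (i = 0; i < list->nmsrs; i++)
			printf("0x%x\n", list->indices[i]); /* 0x8b == MSR_IA32_UCODE_REV */
	free(list);
	return 0;
}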
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3e70af42f65b..9f24f5d16854 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1228,6 +1228,7 @@ static const u32 emulated_msrs_all[] = { MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, /* * The following list leaves out MSRs whose values are determined -- cgit v1.2.3 From 0e20f5e25556c00ee813469d373b00abcf298708 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 13 Dec 2019 13:25:25 +0000 Subject: KVM: arm/arm64: Cleanup MMIO handling Our MMIO handling is a bit odd, in the sense that it uses an intermediate per-vcpu structure to store the various decoded information that describe the access. But the same information is readily available in the HSR/ESR_EL2 field, and we actually use this field to populate the structure. Let's simplify the whole thing by getting rid of the superfluous structure and save a (tiny) bit of space in the vcpu structure. [32bit fix courtesy of Olof Johansson ] Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_emulate.h | 5 ++- arch/arm/include/asm/kvm_host.h | 12 ++++--- arch/arm/include/asm/kvm_hyp.h | 1 + arch/arm/include/asm/kvm_mmio.h | 28 --------------- arch/arm64/include/asm/kvm_emulate.h | 3 +- arch/arm64/include/asm/kvm_host.h | 12 ++++--- arch/arm64/include/asm/kvm_mmio.h | 27 -------------- virt/kvm/arm/mmio.c | 70 ++++++++++++------------------------ virt/kvm/arm/mmu.c | 1 - 9 files changed, 42 insertions(+), 117 deletions(-) delete mode 100644 arch/arm/include/asm/kvm_mmio.h delete mode 100644 arch/arm64/include/asm/kvm_mmio.h (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h index 08d9805f613b..3944305e81df 100644 --- a/arch/arm/include/asm/kvm_emulate.h +++ b/arch/arm/include/asm/kvm_emulate.h @@ -9,7 +9,6 @@ #include #include -#include #include #include @@ -220,7 +219,7 @@ static inline bool kvm_vcpu_dabt_is_cm(struct kvm_vcpu *vcpu) } /* Get Access Size from a data abort */ -static inline int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu) +static inline unsigned int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu) { switch ((kvm_vcpu_get_hsr(vcpu) >> 22) & 0x3) { case 0: @@ -231,7 +230,7 @@ static inline int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu) return 4; default: kvm_err("Hardware is weird: SAS 0b11 is reserved\n"); - return -EFAULT; + return 4; } } diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 556cd818eccf..bd2233805d99 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -202,9 +201,6 @@ struct kvm_vcpu_arch { /* Don't run the guest (internal implementation need) */ bool pause; - /* IO related fields */ - struct kvm_decode mmio_decode; - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; @@ -300,6 +296,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, static inline void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index) {} +/* MMIO helpers */ +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); + +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, + 
phys_addr_t fault_ipa); + static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index 40e9034db601..3c1b55ecc578 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #define __hyp_text __section(.hyp.text) notrace diff --git a/arch/arm/include/asm/kvm_mmio.h b/arch/arm/include/asm/kvm_mmio.h deleted file mode 100644 index 32fbf82e3ebc..000000000000 --- a/arch/arm/include/asm/kvm_mmio.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#ifndef __ARM_KVM_MMIO_H__ -#define __ARM_KVM_MMIO_H__ - -#include -#include -#include - -struct kvm_decode { - unsigned long rt; - bool sign_extend; - /* Not used on 32-bit arm */ - bool sixty_four; -}; - -void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); -unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); - -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa); - -#endif /* __ARM_KVM_MMIO_H__ */ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 53ea7637b7b2..688c63412cc2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -341,7 +340,7 @@ static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu) return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_CM); } -static inline int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu) +static inline unsigned int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu) { return 1 << ((kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT); } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c61260cf63c5..f6a77ddab956 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -24,7 +24,6 @@ #include #include #include -#include #include #define __KVM_HAVE_ARCH_INTC_INITIALIZED @@ -325,9 +324,6 @@ struct kvm_vcpu_arch { /* Don't run the guest (internal implementation need) */ bool pause; - /* IO related fields */ - struct kvm_decode mmio_decode; - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; @@ -491,6 +487,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); +/* MMIO helpers */ +void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); +unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); + +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, + phys_addr_t fault_ipa); + int kvm_perf_init(void); int kvm_perf_teardown(void); diff --git a/arch/arm64/include/asm/kvm_mmio.h b/arch/arm64/include/asm/kvm_mmio.h deleted file mode 100644 index b204501a0c39..000000000000 --- a/arch/arm64/include/asm/kvm_mmio.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall - */ - -#ifndef __ARM64_KVM_MMIO_H__ 
-#define __ARM64_KVM_MMIO_H__ - -#include -#include - -struct kvm_decode { - unsigned long rt; - bool sign_extend; - /* Witdth of the register accessed by the faulting instruction is 64-bits */ - bool sixty_four; -}; - -void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); -unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); - -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa); - -#endif /* __ARM64_KVM_MMIO_H__ */ diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index 1bb71acd53f2..aedfcff99ac5 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -5,7 +5,6 @@ */ #include -#include #include #include @@ -92,26 +91,23 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mmio_needed = 0; - if (!run->mmio.is_write) { - len = run->mmio.len; - if (len > sizeof(unsigned long)) - return -EINVAL; - + if (!kvm_vcpu_dabt_iswrite(vcpu)) { + len = kvm_vcpu_dabt_get_as(vcpu); data = kvm_mmio_read_buf(run->mmio.data, len); - if (vcpu->arch.mmio_decode.sign_extend && + if (kvm_vcpu_dabt_issext(vcpu) && len < sizeof(unsigned long)) { mask = 1U << ((len * 8) - 1); data = (data ^ mask) - mask; } - if (!vcpu->arch.mmio_decode.sixty_four) + if (!kvm_vcpu_dabt_issf(vcpu)) data = data & 0xffffffff; trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); - vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); + vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); } /* @@ -123,36 +119,6 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) return 0; } -static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) -{ - unsigned long rt; - int access_size; - bool sign_extend; - bool sixty_four; - - if (kvm_vcpu_dabt_iss1tw(vcpu)) { - /* page table accesses IO mem: tell guest to fix its TTBR */ - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - return 1; - } - - access_size = kvm_vcpu_dabt_get_as(vcpu); - if (unlikely(access_size < 0)) - return access_size; - - *is_write = kvm_vcpu_dabt_iswrite(vcpu); - sign_extend = kvm_vcpu_dabt_issext(vcpu); - sixty_four = kvm_vcpu_dabt_issf(vcpu); - rt = kvm_vcpu_dabt_get_rd(vcpu); - - *len = access_size; - vcpu->arch.mmio_decode.sign_extend = sign_extend; - vcpu->arch.mmio_decode.rt = rt; - vcpu->arch.mmio_decode.sixty_four = sixty_four; - - return 0; -} - int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, phys_addr_t fault_ipa) { @@ -164,15 +130,10 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, u8 data_buf[8]; /* - * Prepare MMIO operation. First decode the syndrome data we get - * from the CPU. Then try if some in-kernel emulation feels - * responsible, otherwise let user space do its magic. + * No valid syndrome? Ask userspace for help if it has + * voluntered to do so, and bail out otherwise. 
*/ - if (kvm_vcpu_dabt_isvalid(vcpu)) { - ret = decode_hsr(vcpu, &is_write, &len); - if (ret) - return ret; - } else { + if (!kvm_vcpu_dabt_isvalid(vcpu)) { if (vcpu->kvm->arch.return_nisv_io_abort_to_user) { run->exit_reason = KVM_EXIT_ARM_NISV; run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); @@ -184,7 +145,20 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, return -ENOSYS; } - rt = vcpu->arch.mmio_decode.rt; + /* Page table accesses IO mem: tell guest to fix its TTBR */ + if (kvm_vcpu_dabt_iss1tw(vcpu)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + + /* + * Prepare MMIO operation. First decode the syndrome data we get + * from the CPU. Then try if some in-kernel emulation feels + * responsible, otherwise let user space do its magic. + */ + is_write = kvm_vcpu_dabt_iswrite(vcpu); + len = kvm_vcpu_dabt_get_as(vcpu); + rt = kvm_vcpu_dabt_get_rd(vcpu); if (is_write) { data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index e3ad95013192..a4fa81d75e84 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 290a6bb06de9ec24cecbb11bf4be35411d0b2625 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 20 Jan 2020 14:08:25 +0100 Subject: arm64: KVM: Add UAPI notes for swapped registers Two UAPI system register IDs do not derive their values from the ARM system register encodings. This is because their values were accidentally swapped. As the IDs are API, they cannot be changed. Add WARNING notes to point them out. Suggested-by: Marc Zyngier Signed-off-by: Andrew Jones [maz: turned XXX into WARNING] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20200120130825.28838-1-drjones@redhat.com --- Documentation/virt/kvm/api.txt | 9 +++++++++ arch/arm64/include/uapi/asm/kvm.h | 12 ++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index ebb37b34dcfc..3a0c819c3573 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -2196,6 +2196,15 @@ arm64 CCSIDR registers are demultiplexed by CSSELR value: arm64 system registers have the following id bit patterns: 0x6030 0000 0013 +WARNING: + Two system register IDs do not follow the specified pattern. These + are KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT, which map to + system registers CNTV_CVAL_EL0 and CNTVCT_EL0 respectively. These + two had their values accidentally swapped, which means TIMER_CVAL is + derived from the register encoding for CNTVCT_EL0 and TIMER_CNT is + derived from the register encoding for CNTV_CVAL_EL0. As this is + API, it must remain this way. + arm64 firmware pseudo-registers have the following bit pattern: 0x6030 0000 0014 diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 820e5751ada7..ba85bb23f060 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -220,10 +220,18 @@ struct kvm_vcpu_events { #define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2) #define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1) -/* EL0 Virtual Timer Registers */ +/* + * EL0 Virtual Timer Registers + * + * WARNING: + * KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT are not defined + * with the appropriate register encodings. Their values have been + * accidentally swapped. 
As this is set API, the definitions here + must be used, rather than ones derived from the encodings. + */ #define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1) -#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) #define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2) +#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) /* KVM-as-firmware specific pseudo-registers */ #define KVM_REG_ARM_FW (0x0014 << KVM_REG_ARM_COPROC_SHIFT) -- cgit v1.2.3 From 6645d8542ef922486b733d415d2bec3b0622c27e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 20 Jan 2020 12:47:06 +0000 Subject: arm64: KVM: Annotate guest entry/exit as a single function In an effort to clarify and simplify the annotations of assembly functions in the kernel, new macros have been introduced to replace ENTRY and ENDPROC. There are separate annotations SYM_FUNC_ for normal C functions and SYM_CODE_ for other code. Currently __guest_enter and __guest_exit are annotated as standard functions, but this is not entirely correct: the former doesn't do a normal return and the latter is not entered in a normal fashion. From the point of view of the hypervisor, guest entry/exit may be viewed as a single function that happens to have an eret in the middle of it, so let's annotate it as such. Suggested-by: Mark Rutland Signed-off-by: Mark Brown Signed-off-by: Marc Zyngier Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200120124706.8681-1-broonie@kernel.org --- arch/arm64/kvm/hyp/entry.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index e5cc8d66bf53..5b76a89939b1 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -44,7 +44,7 @@ * u64 __guest_enter(struct kvm_vcpu *vcpu, * struct kvm_cpu_context *host_ctxt); */ -ENTRY(__guest_enter) +SYM_FUNC_START(__guest_enter) // x0: vcpu // x1: host context // x2-x17: clobbered by macros @@ -96,9 +96,8 @@ alternative_else_nop_endif // Do not touch any register after this! eret sb -ENDPROC(__guest_enter) -ENTRY(__guest_exit) +SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL) // x0: return code // x1: vcpu // x2-x29,lr: vcpu regs @@ -192,4 +191,4 @@ abort_guest_exit_end: msr spsr_el2, x4 orr x0, x0, x5 1: ret -ENDPROC(__guest_exit) +SYM_FUNC_END(__guest_enter) -- cgit v1.2.3 From 6441fa6178f5456d1d4b512c08798888f99db185 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jan 2020 16:33:06 +0100 Subject: KVM: x86: avoid incorrect writes to host MSR_IA32_SPEC_CTRL If the guest is configured to have SPEC_CTRL but the host does not (a nonsensical configuration, but one that is not explicitly forbidden) then a host-initiated MSR write can write vmx->spec_ctrl (respectively svm->spec_ctrl) and trigger a #GP when KVM tries to restore the host value of the MSR. Add a more comprehensive check for valid bits of SPEC_CTRL, covering host CPUID flags and, while at it (it is more correct that way), guest CPUID flags too. For AMD, remove the unnecessary is_guest_mode check around setting the MSR interception bitmap, so that the code looks the same as for Intel.
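Modelled in isolation, the valid-bits check added below behaves as follows (host and guest feature availability reduced to plain booleans; a sketch of the logic, not the kernel implementation):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPEC_CTRL_IBRS	(1ULL << 0)
#define SPEC_CTRL_STIBP	(1ULL << 1)
#define SPEC_CTRL_SSBD	(1ULL << 2)

/* A bit stays valid only if both the host and the guest advertise it. */
static uint64_t spec_ctrl_valid_bits(bool host_ibrs, bool guest_ibrs,
				     bool host_ssbd, bool guest_ssbd)
{
	uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;

	if (!host_ibrs || !guest_ibrs)
		bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
	if (!host_ssbd || !guest_ssbd)
		bits &= ~SPEC_CTRL_SSBD;
	return bits;
}

int main(void)
{
	/* Guest advertises SPEC_CTRL, host lacks it: the write now fails. */
	uint64_t bits = spec_ctrl_valid_bits(false, true, true, true);
	uint64_t data = SPEC_CTRL_IBRS;

	printf("wrmsr %s\n", (data & ~bits) ? "rejected (#GP)" : "allowed");
	return 0;
}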
Cc: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 9 +++------ arch/x86/kvm/vmx/vmx.c | 7 +++---- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 29 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b7c5369c7998..235a7e51de96 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4324,12 +4324,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; svm->spec_ctrl = data; - if (!data) break; @@ -4353,13 +4351,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_AMD_IBPB)) + return 1; if (!data) break; wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); - if (is_guest_mode(vcpu)) - break; set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; case MSR_AMD64_VIRT_SPEC_CTRL: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bdbf27e92851..112d2314231d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1998,12 +1998,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; - /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) + if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) return 1; vmx->spec_ctrl = data; - if (!data) break; @@ -2037,7 +2035,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~PRED_CMD_IBPB) return 1; - + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL)) + return 1; if (!data) break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f24f5d16854..b690c0d70793 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10389,6 +10389,28 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) +{ + uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + + /* The STIBP bit doesn't fault even if it's not advertised */ + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && + !boot_cpu_has(X86_FEATURE_AMD_IBRS)) + bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) + bits &= ~SPEC_CTRL_SSBD; + + return bits; +} +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ab715cee3653..dd6e34d0a881 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -367,5 +367,6 @@ static inline bool kvm_pat_valid(u64 data) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); +u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); #endif -- cgit v1.2.3 From 1a978d9d3e72ddfa40ac60d26301b154247ee0bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 
13:54:46 -0800 Subject: KVM: PPC: Book3S HV: Uninit vCPU if vcore creation fails Call kvm_vcpu_uninit() if vcore creation fails to avoid leaking any resources allocated by kvm_vcpu_init(), i.e. the vcpu->run page. Fixes: 371fefd6f2dc4 ("KVM: PPC: Allow book3s_hv guests to use SMT processor modes") Cc: stable@vger.kernel.org Reviewed-by: Greg Kurz Signed-off-by: Sean Christopherson Acked-by: Paul Mackerras Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6ff3f896d908..ef6aa63b071b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2368,7 +2368,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, mutex_unlock(&kvm->lock); if (!vcore) - goto free_vcpu; + goto uninit_vcpu; spin_lock(&vcore->lock); ++vcore->num_threads; @@ -2385,6 +2385,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, return vcpu; +uninit_vcpu: + kvm_vcpu_uninit(vcpu); free_vcpu: kmem_cache_free(kvm_vcpu_cache, vcpu); out: -- cgit v1.2.3 From cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:47 -0800 Subject: KVM: PPC: Book3S PR: Free shared page if mmu initialization fails Explicitly free the shared page if kvmppc_mmu_init() fails during kvmppc_core_vcpu_create(), as the page is freed only in kvmppc_core_vcpu_free(), which is not reached via kvm_vcpu_uninit(). Fixes: 96bc451a15329 ("KVM: PPC: Introduce shared page") Cc: stable@vger.kernel.org Reviewed-by: Greg Kurz Signed-off-by: Sean Christopherson Acked-by: Paul Mackerras Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_pr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index ce4fcf76e53e..26ca62b6d773 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1806,10 +1806,12 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, err = kvmppc_mmu_init(vcpu); if (err < 0) - goto uninit_vcpu; + goto free_shared_page; return vcpu; +free_shared_page: + free_page((unsigned long)vcpu->arch.shared); uninit_vcpu: kvm_vcpu_uninit(vcpu); free_shadow_vcpu: -- cgit v1.2.3 From 16be9ddea268ad841457a59109963fff8c9de38d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:48 -0800 Subject: KVM: x86: Free wbinvd_dirty_mask if vCPU creation fails Free the vCPU's wbinvd_dirty_mask if vCPU creation fails after kvm_arch_vcpu_init(), e.g. when installing the vCPU's file descriptor. Do the freeing by calling kvm_arch_vcpu_free() instead of open coding the freeing. This adds a likely superfluous, but ultimately harmless, call to kvmclock_reset(), which only clears vcpu->arch.pv_time_enabled. Using kvm_arch_vcpu_free() allows for additional cleanup in the future. 
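All three leak fixes in this stretch (the vcore and shared-page error paths above, and the wbinvd mask here) are instances of the same rule: an error path must unwind in exact reverse order of setup. A generic sketch of the pattern with hypothetical helpers:

#include <stdlib.h>

struct vcpu { void *run_page; void *shared_page; };

/* Hypothetical setup/teardown pair, for illustration only. */
static int init_core(struct vcpu *v)
{
	v->run_page = malloc(4096);
	return v->run_page ? 0 : -1;
}

static void uninit_core(struct vcpu *v)
{
	free(v->run_page);
}

static struct vcpu *vcpu_create(void)
{
	struct vcpu *v = calloc(1, sizeof(*v));

	if (!v)
		return NULL;
	if (init_core(v))
		goto err_free;
	v->shared_page = malloc(4096);
	if (!v->shared_page)
		goto err_uninit;	/* unwind everything done so far */
	return v;

err_uninit:
	uninit_core(v);
err_free:
	free(v);
	return NULL;
}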
Fixes: f5f48ee15c2ee ("KVM: VMX: Execute WBINVD to keep data consistency with assigned devices") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b690c0d70793..a3eeeb5f303e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9243,7 +9243,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); - kvm_x86_ops->vcpu_free(vcpu); + kvm_arch_vcpu_free(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) -- cgit v1.2.3 From 034d8e2cb929ed73e32e2dba98cb8067eab85964 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:49 -0800 Subject: KVM: VMX: Allocate VPID after initializing VCPU Do VPID allocation after calling the common kvm_vcpu_init() as a step towards doing vCPU allocation (via kmem_cache_zalloc()) and calling kvm_vcpu_init() back-to-back. Squishing allocation and initialization together will eventually allow the sequence to be moved to arch-agnostic creation code. Note, the VPID is not consumed until KVM_RUN, so slightly delaying its allocation should have no real functional impact. VPID allocation was arbitrarily placed in the original patch, commit 2384d2b326408 ("KVM: VMX: Enable Virtual Processor Identification (VPID)"). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 112d2314231d..42171b4013a8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6717,14 +6717,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_user_fpu; } - vmx->vpid = allocate_vpid(); - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) goto free_vcpu; err = -ENOMEM; + vmx->vpid = allocate_vpid(); + /* * If PML is turned on, failure on enabling PML just results in failure * of creating the vcpu, therefore we can simplify PML logic (by @@ -6835,8 +6835,8 @@ free_pml: vmx_destroy_pml_buffer(vmx); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); -free_vcpu: free_vpid(vmx->vpid); +free_vcpu: kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); -- cgit v1.2.3 From 34109c0476f10c033945b630a58c087e9d0ef8a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:50 -0800 Subject: KVM: VMX: Use direct vcpu pointer during vCPU create/free Capture the vcpu pointer in a local variable and replace '&vmx->vcpu' references with a direct reference to the pointer in anticipation of moving bits of the code to common x86 and passing the vcpu pointer into vmx_create_vcpu(), i.e. eliminate unnecessary noise from future patches.
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 42171b4013a8..e2da9082df89 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6682,17 +6682,17 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vmx); } static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { - int err; + struct kvm_vcpu *vcpu; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; - int i, cpu; + int i, cpu, err; BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, "struct kvm_vcpu must be at offset 0 for arch usercopy region"); @@ -6701,23 +6701,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.user_fpu) { + vcpu = &vmx->vcpu; + + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; goto free_partial_vcpu; } - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vmx->vcpu.arch.guest_fpu) { + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; goto free_user_fpu; } - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); + err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_vcpu; @@ -6789,12 +6791,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; init_vmcs(vmx); - vmx_vcpu_put(&vmx->vcpu); + vmx_vcpu_put(vcpu); put_cpu(); - if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { + if (cpu_need_virtualize_apic_accesses(vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -6809,7 +6811,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (nested) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, vmx_capability.ept, - kvm_vcpu_apicv_active(&vmx->vcpu)); + kvm_vcpu_apicv_active(vcpu)); else memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); @@ -6827,19 +6829,19 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->ept_pointer = INVALID_PAGE; - return &vmx->vcpu; + return vcpu; free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_pml: vmx_destroy_pml_buffer(vmx); uninit_vcpu: - kvm_vcpu_uninit(&vmx->vcpu); + kvm_vcpu_uninit(vcpu); free_vpid(vmx->vpid); free_vcpu: - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: - kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); free_partial_vcpu: kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); -- cgit v1.2.3 From 
7f27179a88a693f2d357860fddef65704578edf5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:51 -0800 Subject: KVM: SVM: Use direct vcpu pointer during vCPU create/free Capture the vcpu pointer in a local variable and replace '&svm->vcpu' references with a direct reference to the pointer in anticipation of moving bits of the code to common x86 and passing the vcpu pointer into svm_create_vcpu(), i.e. eliminate unnecessary noise from future patches. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 235a7e51de96..b0d9045cf115 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2189,6 +2189,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { + struct kvm_vcpu *vcpu; struct vcpu_svm *svm; struct page *page; struct page *msrpm_pages; @@ -2204,24 +2205,25 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = -ENOMEM; goto out; } + vcpu = &svm->vcpu; - svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.user_fpu) { + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; goto free_partial_svm; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!svm->vcpu.arch.guest_fpu) { + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; goto free_user_fpu; } - err = kvm_vcpu_init(&svm->vcpu, kvm, id); + err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_svm; @@ -2265,9 +2267,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - svm_init_osvw(&svm->vcpu); + svm_init_osvw(vcpu); - return &svm->vcpu; + return vcpu; free_page4: __free_page(hsave_page); @@ -2278,11 +2280,11 @@ free_page2: free_page1: __free_page(page); uninit: - kvm_vcpu_uninit(&svm->vcpu); + kvm_vcpu_uninit(vcpu); free_svm: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); free_partial_svm: kmem_cache_free(kvm_vcpu_cache, svm); out: @@ -2313,8 +2315,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); - kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } -- cgit v1.2.3 From a9dd6f09d7e54d3f58be32d7d051196f7a00e69e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:52 -0800 Subject: KVM: x86: Allocate vcpu struct in common x86 code Move allocation of VMX and SVM vcpus to common x86.
Although the struct being allocated is technically a VMX/SVM struct, it can be interpreted directly as a 'struct kvm_vcpu' because of the pre-existing requirement that 'struct kvm_vcpu' be located at offset zero of the arch/vendor vcpu struct. Remove the message from the build-time assertions regarding placement of the struct, as compatibility with the arch usercopy region is no longer the sole dependent on 'struct kvm_vcpu' being at offset zero. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 28 +++++++++------------------- arch/x86/kvm/vmx/vmx.c | 24 ++++++++---------------- arch/x86/kvm/x86.c | 16 ++++++++++++---- 4 files changed, 30 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b5c280644e5..aa591a77072b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1050,7 +1050,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); + int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b0d9045cf115..319c487e2222 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2187,9 +2187,9 @@ static int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) +static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - struct kvm_vcpu *vcpu; struct vcpu_svm *svm; struct page *page; struct page *msrpm_pages; @@ -2197,22 +2197,15 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; - BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); - - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!svm) { - err = -ENOMEM; - goto out; - } - vcpu = &svm->vcpu; + BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); + svm = to_svm(vcpu); vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; - goto free_partial_svm; + goto out; } vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, @@ -2225,7 +2218,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_svm; + goto free_guest_fpu; err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); @@ -2269,7 +2262,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm_init_osvw(vcpu); - return vcpu; + return 0; free_page4: __free_page(hsave_page); @@ -2281,14 +2274,12 @@ free_page1: __free_page(page); uninit: kvm_vcpu_uninit(vcpu); -free_svm: +free_guest_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -free_partial_svm: - kmem_cache_free(kvm_vcpu_cache, svm); out: - return ERR_PTR(err); + return err; } static void svm_clear_current_vmcb(struct vmcb *vmcb) @@ -2317,7 +2308,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) kvm_vcpu_uninit(vcpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); 
kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, svm); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e2da9082df89..2cbeb0a638aa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6684,31 +6684,24 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) kvm_vcpu_uninit(vcpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, vmx); } -static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) +static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - struct kvm_vcpu *vcpu; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; int i, cpu, err; - BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); - - vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!vmx) - return ERR_PTR(-ENOMEM); - - vcpu = &vmx->vcpu; + BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); + vmx = to_vmx(vcpu); vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); err = -ENOMEM; - goto free_partial_vcpu; + goto out; } vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, @@ -6829,7 +6822,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->ept_pointer = INVALID_PAGE; - return vcpu; + return 0; free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); @@ -6842,9 +6835,8 @@ free_vcpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); free_user_fpu: kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -free_partial_vcpu: - kmem_cache_free(kvm_vcpu_cache, vmx); - return ERR_PTR(err); +out: + return err; } #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3eeeb5f303e..cfcefdbe2784 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9172,26 +9172,34 @@ static void fx_init(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; - kvmclock_reset(vcpu); kvm_x86_ops->vcpu_free(vcpu); - free_cpumask_var(wbinvd_dirty_mask); + + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(kvm_vcpu_cache, vcpu); } struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; + int r; if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - vcpu = kvm_x86_ops->vcpu_create(kvm, id); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); + if (!vcpu) + return ERR_PTR(-ENOMEM); + r = kvm_x86_ops->vcpu_create(kvm, vcpu, id); + if (r) { + kmem_cache_free(kvm_vcpu_cache, vcpu); + return ERR_PTR(r); + } return vcpu; } -- cgit v1.2.3 From fc6e2a1845abfcfa335aef5ffaac664e104d72ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:53 -0800 Subject: KVM: x86: Move FPU allocation to common x86 code The allocation of FPU structs is identical across VMX and SVM, move it to common x86 code. 
Somewhat arbitrarily place the allocation so that it resides directly above the associated initialization via fx_init(), e.g. instead of retaining its position with respect to the overall vcpu creation flow. Although the names names kvm_arch_vcpu_create() and kvm_arch_vcpu_init() might suggest otherwise, x86 does not have a clean split between 'create' and 'init'. Allocating the struct immediately prior to the first use arguably improves readability *now*, and will yield even bigger improvements when kvm_arch_vcpu_init() is removed in a future patch. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 25 +------------------------ arch/x86/kvm/vmx/vmx.c | 25 +------------------------ arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 319c487e2222..e8a5cd44dd59 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2200,25 +2200,9 @@ static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto out; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } - err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_guest_fpu; + return err; err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); @@ -2274,11 +2258,6 @@ free_page1: __free_page(page); uninit: kvm_vcpu_uninit(vcpu); -free_guest_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -out: return err; } @@ -2306,8 +2285,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2cbeb0a638aa..40c47d2709bb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6682,8 +6682,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kvm_vcpu_uninit(vcpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); } static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, @@ -6696,25 +6694,9 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); - err = -ENOMEM; - goto out; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); - err = -ENOMEM; - goto free_user_fpu; - } - err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + return err; err = -ENOMEM; @@ -6831,11 +6813,6 @@ free_pml: 
uninit_vcpu: kvm_vcpu_uninit(vcpu); free_vpid(vmx->vpid); -free_vcpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -out: return err; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cfcefdbe2784..29d058db3207 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9177,6 +9177,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } @@ -9543,6 +9545,21 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { + pr_err("kvm: failed to allocate userspace's fpu\n"); + r = -ENOMEM; + goto free_wbinvd_dirty_mask; + } + + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); + r = -ENOMEM; + goto free_user_fpu; + } fx_init(vcpu); vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -9561,6 +9578,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; +free_user_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: -- cgit v1.2.3 From d813a8ba54f94fd6a0276230bdf53c97b36c2101 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:54 -0800 Subject: KVM: x86: Move allocation of pio_data page down a few lines Allocate the pio_data page after creating the MMU and local APIC so that all direct memory allocations are grouped together. This allows setting the return value to -ENOMEM prior to starting the allocations instead of setting it in the fail path for every allocation. The pio_data page is only consumed when KVM_RUN is invoked, i.e. moving its allocation has no real functional impact. 
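The error-handling idiom at work here is worth spelling out: once all direct memory allocations sit adjacent to one another, the error code can be assigned a single time before the first allocation instead of inside every failure branch. A minimal standalone sketch of that idiom, with illustrative user-space names rather than the kernel's own:

	#include <errno.h>
	#include <stdlib.h>

	struct vcpu_demo {
		void *pio_data;
		unsigned long *mce_banks;
	};

	static int vcpu_demo_init(struct vcpu_demo *v)
	{
		int r = -ENOMEM;	/* set once, covers every allocation below */

		v->pio_data = calloc(1, 4096);
		if (!v->pio_data)
			goto fail;

		v->mce_banks = calloc(32, sizeof(*v->mce_banks));
		if (!v->mce_banks)
			goto free_pio_data;

		return 0;

	free_pio_data:			/* unwind in reverse order of construction */
		free(v->pio_data);
	fail:
		return r;
	}

The same reverse-order unwind is what the reshuffled fail_free_* labels in the diff below preserve.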
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29d058db3207..50110bca7d57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9510,18 +9510,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); - kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) - goto fail_free_pio_data; + return r; if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); @@ -9531,25 +9524,27 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } else static_key_slow_inc(&kvm_no_apic_vcpu); + r = -ENOMEM; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto fail_free_lapic; + vcpu->arch.pio_data = page_address(page); + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) { - r = -ENOMEM; - goto fail_free_lapic; - } + if (!vcpu->arch.mce_banks) + goto fail_free_pio_data; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, - GFP_KERNEL_ACCOUNT)) { - r = -ENOMEM; + GFP_KERNEL_ACCOUNT)) goto fail_free_mce_banks; - } vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu->arch.user_fpu) { pr_err("kvm: failed to allocate userspace's fpu\n"); - r = -ENOMEM; goto free_wbinvd_dirty_mask; } @@ -9557,7 +9552,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) GFP_KERNEL_ACCOUNT); if (!vcpu->arch.guest_fpu) { pr_err("kvm: failed to allocate vcpu's fpu\n"); - r = -ENOMEM; goto free_user_fpu; } fx_init(vcpu); @@ -9584,13 +9578,12 @@ free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: return r; } -- cgit v1.2.3 From 987b2594ed5d128c95c5255a9c7755f7480bf407 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:55 -0800 Subject: KVM: x86: Move kvm_vcpu_init() invocation to common code Move the kvm_cpu_{un}init() calls to common x86 code as an intermediate step to removing kvm_cpu_{un}init() altogether. Note, VMX'x alloc_apic_access_page() and init_rmode_identity_map() are per-VM allocations and are intentionally kept if vCPU creation fails. They are freed by kvm_arch_destroy_vm(). No functional change intended. 
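The resulting division of labor, with generic init owned by the common caller and only vendor setup left in the hook, can be sketched roughly as follows (hypothetical names; the actual flow is in the x86.c hunk below):

	#include <errno.h>
	#include <stdlib.h>

	struct vcpu { int id; void *arch_state; };

	static int arch_vcpu_create(struct vcpu *v)	/* stands in for the vendor hook */
	{
		v->arch_state = calloc(1, 64);
		return v->arch_state ? 0 : -ENOMEM;
	}

	static void vcpu_uninit(struct vcpu *v) { v->id = -1; }

	/* Common code owns allocation and generic init; each failure
	 * path undoes exactly the steps that already succeeded. */
	static struct vcpu *vcpu_create(int id)
	{
		struct vcpu *v = calloc(1, sizeof(*v));

		if (!v)
			return NULL;
		v->id = id;			/* stands in for kvm_vcpu_init() */
		if (arch_vcpu_create(v)) {
			vcpu_uninit(v);
			free(v);
			return NULL;
		}
		return v;
	}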
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 13 +++---------- arch/x86/kvm/vmx/vmx.c | 19 ++++++------------- arch/x86/kvm/x86.c | 20 +++++++++++++++----- 4 files changed, 25 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa591a77072b..fff9ed6956b5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1050,7 +1050,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ - int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned id); + int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e8a5cd44dd59..83257a7a2e37 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2187,8 +2187,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int svm_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *page; @@ -2200,14 +2199,10 @@ static int svm_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0); svm = to_svm(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - err = -ENOMEM; page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) - goto uninit; + goto out; msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!msrpm_pages) @@ -2256,8 +2251,7 @@ free_page2: __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); free_page1: __free_page(page); -uninit: - kvm_vcpu_uninit(vcpu); +out: return err; } @@ -2284,7 +2278,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); - kvm_vcpu_uninit(vcpu); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 40c47d2709bb..2134726b0442 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6681,11 +6681,9 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); - kvm_vcpu_uninit(vcpu); } -static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx; unsigned long *msr_bitmap; @@ -6694,10 +6692,6 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - err = -ENOMEM; vmx->vpid = allocate_vpid(); @@ -6711,7 +6705,7 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, if (enable_pml) { vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) - goto uninit_vcpu; + goto free_vpid; } BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS); @@ -6756,7 +6750,7 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, 
MSR_TYPE_RW); - if (kvm_cstate_in_guest(kvm)) { + if (kvm_cstate_in_guest(vcpu->kvm)) { vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); @@ -6772,13 +6766,13 @@ static int vmx_create_vcpu(struct kvm *kvm, struct kvm_vcpu *vcpu, vmx_vcpu_put(vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(vcpu)) { - err = alloc_apic_access_page(kvm); + err = alloc_apic_access_page(vcpu->kvm); if (err) goto free_vmcs; } if (enable_ept && !enable_unrestricted_guest) { - err = init_rmode_identity_map(kvm); + err = init_rmode_identity_map(vcpu->kvm); if (err) goto free_vmcs; } @@ -6810,8 +6804,7 @@ free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_pml: vmx_destroy_pml_buffer(vmx); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); +free_vpid: free_vpid(vmx->vpid); return err; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 50110bca7d57..51292843afcb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9176,6 +9176,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); + kvm_vcpu_uninit(vcpu); + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); @@ -9197,12 +9199,20 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (!vcpu) return ERR_PTR(-ENOMEM); - r = kvm_x86_ops->vcpu_create(kvm, vcpu, id); - if (r) { - kmem_cache_free(kvm_vcpu_cache, vcpu); - return ERR_PTR(r); - } + r = kvm_vcpu_init(vcpu, kvm, id); + if (r) + goto free_vcpu; + + r = kvm_x86_ops->vcpu_create(vcpu); + if (r) + goto uninit_vcpu; return vcpu; + +uninit_vcpu: + kvm_vcpu_uninit(vcpu); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); + return ERR_PTR(r); } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3ec8ca29647078db11a820ab22855dd64d9a4897 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:56 -0800 Subject: KVM: PPC: e500mc: Add build-time assert that vcpu is at offset 0 In preparation for moving vcpu allocation to common PPC code, add an explicit, albeit redundant, build-time assert to ensure the vcpu member is located at offset 0. The assert is redundant in the sense that kvmppc_core_vcpu_create_e500() contains a functionally identical assert. The motiviation for adding the extra assert is to provide visual confirmation of the correctness of moving vcpu allocation to common code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500mc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 318e65c65999..c51f4bb086fd 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -308,6 +308,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu; int err; + BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu_e500) { err = -ENOMEM; -- cgit v1.2.3 From c50bfbdc38ec56cf8e53afb4f9ebb600dfcabd49 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:57 -0800 Subject: KVM: PPC: Allocate vcpu struct in common PPC code Move allocation of all flavors of PPC vCPUs to common PPC code. 
All variants either allocate 'struct kvm_vcpu' directly, or require that the embedded 'struct kvm_vcpu' member be located at offset 0, i.e. guarantee that the allocation can be directly interpreted as a 'struct kvm_vcpu' object. Remove the message from the build-time assertion regarding placement of the struct, as compatibility with the arch usercopy region is no longer the sole dependent on 'struct kvm_vcpu' being at offset zero. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/kvm_ppc.h | 7 ++++--- arch/powerpc/kvm/book3s.c | 5 +++-- arch/powerpc/kvm/book3s_hv.c | 20 +++++--------------- arch/powerpc/kvm/book3s_pr.c | 18 +++++------------- arch/powerpc/kvm/booke.c | 5 +++-- arch/powerpc/kvm/e500.c | 26 +++++++------------------- arch/powerpc/kvm/e500mc.c | 24 ++++++------------------ arch/powerpc/kvm/powerpc.c | 23 ++++++++++++++++++----- 8 files changed, 51 insertions(+), 77 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 3d2f871241a8..8f77ca5ace6f 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -119,8 +119,8 @@ extern int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid, enum xlate_readwrite xlrw, struct kvmppc_pte *pte); -extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, - unsigned int id); +extern int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id); extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu); extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu); extern int kvmppc_core_check_processor_compat(void); @@ -274,7 +274,8 @@ struct kvmppc_ops { void (*inject_interrupt)(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr); int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); - struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id); + int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id); void (*vcpu_free)(struct kvm_vcpu *vcpu); int (*check_requests)(struct kvm_vcpu *vcpu); int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 58a59ee998e2..13385656b90d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,9 +789,10 @@ void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - return kvm->arch.kvm_ops->vcpu_create(kvm, id); + return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ef6aa63b071b..a14fb6a9ea5d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2271,22 +2271,16 @@ static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) } #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ -static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - struct kvm_vcpu *vcpu; int err; int core; struct kvmppc_vcore *vcore; - err = -ENOMEM; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; - err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + 
return err; vcpu->arch.shared = &vcpu->arch.shregs; #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -2383,14 +2377,11 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, debugfs_vcpu_init(vcpu, id); - return vcpu; + return 0; uninit_vcpu: kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(err); + return err; } static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, @@ -2445,7 +2436,6 @@ static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); spin_unlock(&vcpu->arch.vpa_update_lock); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 26ca62b6d773..0d7c8a7bcb7b 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1744,21 +1744,16 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; - struct kvm_vcpu *vcpu; int err = -ENOMEM; unsigned long p; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; - vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) - goto free_vcpu; + goto out; vcpu->arch.book3s = vcpu_book3s; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -1808,7 +1803,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, if (err < 0) goto free_shared_page; - return vcpu; + return 0; free_shared_page: free_page((unsigned long)vcpu->arch.shared); @@ -1820,10 +1815,8 @@ free_shadow_vcpu: free_vcpu3s: #endif vfree(vcpu_book3s); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) @@ -1836,7 +1829,6 @@ static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) kfree(vcpu->arch.shadow_vcpu); #endif vfree(vcpu_book3s); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index be9a45874194..047c9f707704 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2114,9 +2114,10 @@ int kvmppc_core_init_vm(struct kvm *kvm) return kvm->arch.kvm_ops->init_vm(kvm); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { - return kvm->arch.kvm_ops->vcpu_create(kvm, id); + return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 00649ca5fa9a..f5dd2c7adcd4 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -433,26 +433,18 @@ static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; - struct kvm_vcpu *vcpu; int err; - BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0, - "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + 
BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); + vcpu_e500 = to_e500(vcpu); - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu_e500) { - err = -ENOMEM; - goto out; - } - - vcpu = &vcpu_e500->vcpu; err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + return err; if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) { err = -ENOMEM; @@ -469,7 +461,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, goto uninit_tlb; } - return vcpu; + return 0; uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); @@ -477,10 +469,7 @@ uninit_id: kvmppc_e500_id_table_free(vcpu_e500); uninit_vcpu: kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); -out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) @@ -491,7 +480,6 @@ static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) kvmppc_e500_tlb_uninit(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } static int kvmppc_core_init_vm_e500(struct kvm *kvm) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index c51f4bb086fd..7c0d392f667a 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -301,28 +301,21 @@ static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu, + unsigned int id) { struct kvmppc_vcpu_e500 *vcpu_e500; - struct kvm_vcpu *vcpu; int err; BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); - - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu_e500) { - err = -ENOMEM; - goto out; - } - vcpu = &vcpu_e500->vcpu; + vcpu_e500 = to_e500(vcpu); /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ vcpu->arch.oldpir = 0xffffffff; err = kvm_vcpu_init(vcpu, kvm, id); if (err) - goto free_vcpu; + return err; err = kvmppc_e500_tlb_init(vcpu_e500); if (err) @@ -334,17 +327,13 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, goto uninit_tlb; } - return vcpu; + return 0; uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); uninit_vcpu: kvm_vcpu_uninit(vcpu); - -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); -out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) @@ -354,7 +343,6 @@ static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 416fb3d2a1d0..fd978f681b66 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -723,12 +723,23 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - vcpu = kvmppc_core_vcpu_create(kvm, id); - if (!IS_ERR(vcpu)) { - vcpu->arch.wqp = &vcpu->wq; - kvmppc_create_vcpu_debugfs(vcpu, id); - } + int err; + + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) + return ERR_PTR(-ENOMEM); + + err = kvmppc_core_vcpu_create(kvm, vcpu, id); + if (err) + goto free_vcpu; + + vcpu->arch.wqp = &vcpu->wq; + kvmppc_create_vcpu_debugfs(vcpu, id); return vcpu; + 
+free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); + return ERR_PTR(err); } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -758,6 +769,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) } kvmppc_core_vcpu_free(vcpu); + + kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From d30769522294fbbd182659614bda16b5da231413 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:58 -0800 Subject: KVM: PPC: Book3S PR: Allocate book3s and shadow vcpu after common init Call kvm_vcpu_init() in kvmppc_core_vcpu_create_pr() prior to allocating the book3s and shadow_vcpu objects in preparation of moving said call to common PPC code. Although kvm_vcpu_init() has an arch callback, the callback is empty for Book3S PR, i.e. barring unseen black magic, moving the allocation has no real functional impact. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_pr.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 0d7c8a7bcb7b..10c65d412e81 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1748,12 +1748,18 @@ static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, unsigned int id) { struct kvmppc_vcpu_book3s *vcpu_book3s; - int err = -ENOMEM; unsigned long p; + int err; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + return err; + + err = -ENOMEM; vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) - goto out; + goto uninit_vcpu; vcpu->arch.book3s = vcpu_book3s; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -1763,14 +1769,9 @@ static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, goto free_vcpu3s; #endif - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_shadow_vcpu; - - err = -ENOMEM; p = __get_free_page(GFP_KERNEL|__GFP_ZERO); if (!p) - goto uninit_vcpu; + goto free_shadow_vcpu; vcpu->arch.shared = (void *)p; #ifdef CONFIG_PPC_BOOK3S_64 /* Always start the shared struct in native endian mode */ @@ -1807,15 +1808,14 @@ static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, free_shared_page: free_page((unsigned long)vcpu->arch.shared); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); free_shadow_vcpu: #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); free_vcpu3s: #endif vfree(vcpu_book3s); -out: +uninit_vcpu: + kvm_vcpu_uninit(vcpu); return err; } -- cgit v1.2.3 From 4dbf6fec78868ef02e8bf32834f16b22f58723f5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:54:59 -0800 Subject: KVM: PPC: e500mc: Move reset of oldpir below call to kvm_vcpu_init() Move the initialization of oldpir so that the call to kvm_vcpu_init() is at the top of kvmppc_core_vcpu_create_e500mc(). oldpir is only use when loading/putting a vCPU, which currently cannot be done until after kvm_arch_vcpu_create() completes. Reording the call to kvm_vcpu_init() paves the way for moving the invocation to common PPC code. No functional change intended. 
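Hoisting the common call to the top is the enabling step: once every arch create function begins with the same kvm_vcpu_init() invocation, that first statement can be lifted verbatim into the common caller. A schematic of the reordering, using hypothetical helper names:

	struct vcpu_demo { unsigned int oldpir; };

	static int common_vcpu_init(struct vcpu_demo *v) { (void)v; return 0; }

	/* before: arch state touched first, blocking the hoist */
	static int create_before(struct vcpu_demo *v)
	{
		v->oldpir = 0xffffffff;
		return common_vcpu_init(v);
	}

	/* after: common init first; oldpir is only consumed at vcpu
	 * load/put time, so the later assignment is equally safe */
	static int create_after(struct vcpu_demo *v)
	{
		int err = common_vcpu_init(v);

		if (err)
			return err;
		v->oldpir = 0xffffffff;
		return 0;
	}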
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500mc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 7c0d392f667a..6c782b8bae0d 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -310,13 +310,13 @@ static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); vcpu_e500 = to_e500(vcpu); - /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ - vcpu->arch.oldpir = 0xffffffff; - err = kvm_vcpu_init(vcpu, kvm, id); if (err) return err; + /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ + vcpu->arch.oldpir = 0xffffffff; + err = kvmppc_e500_tlb_init(vcpu_e500); if (err) goto uninit_vcpu; -- cgit v1.2.3 From ff030fdf55732266c2d35b1a4a0baaf9ce49e9dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:00 -0800 Subject: KVM: PPC: Move kvm_vcpu_init() invocation to common code Move the kvm_cpu_{un}init() calls to common PPC code as an intermediate step towards removing kvm_cpu_{un}init() altogether. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/kvm_ppc.h | 6 ++---- arch/powerpc/kvm/book3s.c | 5 ++--- arch/powerpc/kvm/book3s_hv.c | 17 ++++++----------- arch/powerpc/kvm/book3s_pr.c | 13 +++---------- arch/powerpc/kvm/booke.c | 5 ++--- arch/powerpc/kvm/e500.c | 16 +++------------- arch/powerpc/kvm/e500mc.c | 12 ++---------- arch/powerpc/kvm/powerpc.c | 10 +++++++++- 8 files changed, 29 insertions(+), 55 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 8f77ca5ace6f..bc2494e5710a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -119,8 +119,7 @@ extern int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid, enum xlate_readwrite xlrw, struct kvmppc_pte *pte); -extern int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id); +extern int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu); extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu); extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu); extern int kvmppc_core_check_processor_compat(void); @@ -274,8 +273,7 @@ struct kvmppc_ops { void (*inject_interrupt)(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr); int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); - int (*vcpu_create)(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id); + int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); int (*check_requests)(struct kvm_vcpu *vcpu); int (*get_dirty_log)(struct kvm *kvm, struct kvm_dirty_log *log); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 13385656b90d..3f7adcb0ff63 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -789,10 +789,9 @@ void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } -int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); + return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv.c 
b/arch/powerpc/kvm/book3s_hv.c index a14fb6a9ea5d..f4b72cef09d5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2271,16 +2271,16 @@ static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) } #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ -static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) { int err; int core; struct kvmppc_vcore *vcore; + struct kvm *kvm; + unsigned int id; - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; + kvm = vcpu->kvm; + id = vcpu->vcpu_id; vcpu->arch.shared = &vcpu->arch.shregs; #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -2362,7 +2362,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, mutex_unlock(&kvm->lock); if (!vcore) - goto uninit_vcpu; + return err; spin_lock(&vcore->lock); ++vcore->num_threads; @@ -2378,10 +2378,6 @@ static int kvmppc_core_vcpu_create_hv(struct kvm *kvm, struct kvm_vcpu *vcpu, debugfs_vcpu_init(vcpu, id); return 0; - -uninit_vcpu: - kvm_vcpu_uninit(vcpu); - return err; } static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, @@ -2435,7 +2431,6 @@ static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); spin_unlock(&vcpu->arch.vpa_update_lock); - kvm_vcpu_uninit(vcpu); } static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 10c65d412e81..d88f708d5be3 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1744,22 +1744,17 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, return r; } -static int kvmppc_core_vcpu_create_pr(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s; unsigned long p; int err; - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - err = -ENOMEM; vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) - goto uninit_vcpu; + goto out; vcpu->arch.book3s = vcpu_book3s; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -1814,8 +1809,7 @@ free_shadow_vcpu: free_vcpu3s: #endif vfree(vcpu_book3s); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); +out: return err; } @@ -1824,7 +1818,6 @@ static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); - kvm_vcpu_uninit(vcpu); #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); #endif diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 047c9f707704..d41765157f0e 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2114,10 +2114,9 @@ int kvmppc_core_init_vm(struct kvm *kvm) return kvm->arch.kvm_ops->init_vm(kvm); } -int kvmppc_core_vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm->arch.kvm_ops->vcpu_create(kvm, vcpu, id); + return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index f5dd2c7adcd4..f2b4feaff6d2 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -433,8 +433,7 @@ static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, return r; } -static int 
kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_e500(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500; int err; @@ -442,14 +441,8 @@ static int kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu, BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); vcpu_e500 = to_e500(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - - if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) { - err = -ENOMEM; - goto uninit_vcpu; - } + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + return -ENOMEM; err = kvmppc_e500_tlb_init(vcpu_e500); if (err) @@ -467,8 +460,6 @@ uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); uninit_id: kvmppc_e500_id_table_free(vcpu_e500); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); return err; } @@ -479,7 +470,6 @@ static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); - kvm_vcpu_uninit(vcpu); } static int kvmppc_core_init_vm_e500(struct kvm *kvm) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 6c782b8bae0d..e6b06cb2b92c 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -301,8 +301,7 @@ static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, return r; } -static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu, - unsigned int id) +static int kvmppc_core_vcpu_create_e500mc(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500; int err; @@ -310,16 +309,12 @@ static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); vcpu_e500 = to_e500(vcpu); - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - return err; - /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ vcpu->arch.oldpir = 0xffffffff; err = kvmppc_e500_tlb_init(vcpu_e500); if (err) - goto uninit_vcpu; + return err; vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (!vcpu->arch.shared) { @@ -331,8 +326,6 @@ static int kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, struct kvm_vcpu *vcpu uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); return err; } @@ -342,7 +335,6 @@ static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); - kvm_vcpu_uninit(vcpu); } static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index fd978f681b66..173f57e0a1b5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -729,14 +729,20 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) if (!vcpu) return ERR_PTR(-ENOMEM); - err = kvmppc_core_vcpu_create(kvm, vcpu, id); + err = kvm_vcpu_init(vcpu, kvm, id); if (err) goto free_vcpu; + err = kvmppc_core_vcpu_create(vcpu); + if (err) + goto uninit_vcpu; + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, id); return vcpu; +uninit_vcpu: + kvm_vcpu_uninit(vcpu); free_vcpu: kmem_cache_free(kvm_vcpu_cache, vcpu); return ERR_PTR(err); @@ -770,6 +776,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmppc_core_vcpu_free(vcpu); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } -- cgit v1.2.3 From 5233009fab8e0a037c38a8c2b28ba4b3df203935 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 
Dec 2019 13:55:01 -0800 Subject: KVM: MIPS: Use kvm_vcpu_cache to allocate vCPUs For reasons unknown, MIPS configures the vCPU allocation cache but allocates vCPUs via kzalloc(). Allocate from the vCPU cache in preparation for moving vCPU allocation to common KVM code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 1109924560d8..5f985773417c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -286,7 +286,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) void *gebase, *p, *handler, *refill_start, *refill_end; int i; - struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + struct kvm_vcpu *vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; @@ -401,7 +401,7 @@ out_uninit_cpu: kvm_vcpu_uninit(vcpu); out_free_cpu: - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); @@ -418,7 +418,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 47d51e5eb5fe0bd7440cf7f1217936e2d85c63cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:02 -0800 Subject: KVM: MIPS: Drop kvm_arch_vcpu_free() Remove the superfluous kvm_arch_vcpu_free() as it is no longer called from commmon KVM code. Note, kvm_arch_vcpu_destroy() *is* called from common code, i.e. choosing which function to whack is not completely arbitrary. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5f985773417c..d72bceb10439 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -156,7 +156,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_arch_vcpu_free(vcpu); + kvm_arch_vcpu_destroy(vcpu); } mutex_lock(&kvm->lock); @@ -407,7 +407,7 @@ out: return ERR_PTR(err); } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { hrtimer_cancel(&vcpu->arch.comparecount_timer); @@ -421,11 +421,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu); } -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_free(vcpu); -} - int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { -- cgit v1.2.3 From d5279f3a882c3c5b57703b602d872d3bc7ffa51d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:03 -0800 Subject: KVM: PPC: Drop kvm_arch_vcpu_free() Remove the superfluous kvm_arch_vcpu_free() as it is no longer called from commmon KVM code. Note, kvm_arch_vcpu_destroy() *is* called from common code, i.e. choosing which function to whack is not completely arbitrary. 
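The pattern in these two drop-the-wrapper patches is pure de-indirection: when one function's entire body is a call to the other, the callee's body can be folded into the caller and the callee deleted. Schematically, with toy names:

	#include <stdlib.h>

	struct vcpu { void *state; };

	/* before: destroy() merely forwarded to free() */
	static void vcpu_free_old(struct vcpu *v)    { free(v->state); free(v); }
	static void vcpu_destroy_old(struct vcpu *v) { vcpu_free_old(v); }

	/* after: the wrapper is gone; free()'s old body now *is* destroy() */
	static void vcpu_destroy(struct vcpu *v)     { free(v->state); free(v); }

Keeping kvm_arch_vcpu_destroy() rather than kvm_arch_vcpu_free() is forced by the fact that only the former is invoked from common code.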
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 173f57e0a1b5..a2ba708f5cec 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -475,7 +475,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) #endif kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); + kvm_arch_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) @@ -752,7 +752,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { /* Make sure we're not using the vcpu anymore */ hrtimer_cancel(&vcpu->arch.dec_timer); @@ -781,11 +781,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu); } -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_free(vcpu); -} - int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return kvmppc_core_pending_dec(vcpu); -- cgit v1.2.3 From 9d979c7e6ff43ca3200ffcb74f57415fd633a2da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:05 -0800 Subject: KVM: x86: Remove spurious kvm_mmu_unload() from vcpu destruction path x86 does not load its MMU until KVM_RUN, which cannot be invoked until after vCPU creation succeeds. Given that kvm_arch_vcpu_destroy() is called if and only if vCPU creation fails, it is impossible for the MMU to be loaded. Note, the bogus kvm_mmu_unload() call was added during an unrelated refactoring of vCPU allocation, i.e. was presumably added as an opportunstic "fix" for a perceived leak. Fixes: fb3f0f51d92d1 ("KVM: Dynamically allocate vcpus") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51292843afcb..b731fc7d0306 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9259,10 +9259,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); - kvm_arch_vcpu_free(vcpu); } -- cgit v1.2.3 From 208050dac5ef4de5cb83ffcafa78499c94d0b5ad Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:06 -0800 Subject: KVM: x86: Remove spurious clearing of async #PF MSR Remove a bogus clearing of apf.msr_val from kvm_arch_vcpu_destroy(). apf.msr_val is only set to a non-zero value by kvm_pv_enable_async_pf(), which is only reachable by kvm_set_msr_common(), i.e. by writing MSR_KVM_ASYNC_PF_EN. KVM does not autonomously write said MSR, i.e. can only be written via KVM_SET_MSRS or KVM_RUN. Since KVM_SET_MSRS and KVM_RUN are vcpu ioctls, they require a valid vcpu file descriptor. kvm_arch_vcpu_destroy() is only called if KVM_CREATE_VCPU fails, and KVM declares KVM_CREATE_VCPU successful once the vcpu fd is installed and thus visible to userspace. Ergo, apf.msr_val cannot be non-zero when kvm_arch_vcpu_destroy() is called. 
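That reachability argument can be restated as a pair of assertions. The following is a toy model of the reasoning, not kernel code: the only writer of the field demands an installed fd, while the destroy-on-failure path runs strictly before any fd exists.

	#include <assert.h>

	struct vcpu_demo {
		int fd_installed;		/* set once KVM_CREATE_VCPU succeeds */
		unsigned long long apf_msr_val;	/* written only via a vcpu ioctl */
	};

	static void set_async_pf_msr(struct vcpu_demo *v, unsigned long long val)
	{
		assert(v->fd_installed);	/* vcpu ioctls require the fd */
		v->apf_msr_val = val;
	}

	static void destroy_on_create_failure(struct vcpu_demo *v)
	{
		assert(!v->fd_installed);
		assert(v->apf_msr_val == 0);	/* hence clearing it is a dead store */
	}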
Fixes: 344d9588a9df0 ("KVM: Add PV MSR to enable asynchronous page faults delivery.") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b731fc7d0306..0c3633f9559d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9257,8 +9257,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - vcpu->arch.apf.msr_val = 0; - kvm_arch_vcpu_free(vcpu); } -- cgit v1.2.3 From 50b143e1b3cfb71c38bdd20dd64c98aa3528117f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:07 -0800 Subject: KVM: x86: Drop kvm_arch_vcpu_free() Remove the superfluous kvm_arch_vcpu_free() as it is no longer called from commmon KVM code. Note, kvm_arch_vcpu_destroy() *is* called from common code, i.e. choosing which function to whack is not completely arbitrary. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c3633f9559d..8188d6cac588 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9170,20 +9170,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) -{ - kvmclock_reset(vcpu); - - kvm_x86_ops->vcpu_free(vcpu); - - kvm_vcpu_uninit(vcpu); - - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); -} - struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { @@ -9257,7 +9243,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - kvm_arch_vcpu_free(vcpu); + kvmclock_reset(vcpu); + + kvm_x86_ops->vcpu_free(vcpu); + + kvm_vcpu_uninit(vcpu); + + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9681,7 +9676,7 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); + kvm_arch_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) -- cgit v1.2.3 From 897cc38eaab96d006ab17edd0f50a2f432f584cf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:09 -0800 Subject: KVM: Add kvm_arch_vcpu_precreate() to handle pre-allocation issues Add a pre-allocation arch hook to handle checks that are currently done by arch specific code prior to allocating the vCPU object. This paves the way for moving the allocation to common KVM code. 
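The hook's value is ordering: constraints that previously failed only after the vCPU object already existed can now veto creation before any memory is committed. A minimal sketch of a precreate-style check, with hypothetical fields modeled on the arm and s390 hunks below:

	#include <errno.h>

	struct vm_demo { unsigned int max_vcpus; };

	/* runs before the vCPU allocation; a nonzero return aborts
	 * creation with nothing to unwind */
	static int vcpu_precreate(struct vm_demo *vm, unsigned int id)
	{
		if (id >= vm->max_vcpus)
			return -EINVAL;
		return 0;
	}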
Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 5 +++++ arch/powerpc/kvm/powerpc.c | 5 +++++ arch/s390/kvm/kvm-s390.c | 12 ++++++++---- arch/x86/kvm/x86.c | 14 +++++++++----- include/linux/kvm_host.h | 1 + virt/kvm/arm/arm.c | 21 +++++++++++---------- virt/kvm/kvm_main.c | 4 ++++ 7 files changed, 43 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index d72bceb10439..2e14455aec4e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -280,6 +280,11 @@ static inline void dump_handler(const char *symbol, void *start, void *end) pr_debug("\tEND(%s)\n", symbol); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a2ba708f5cec..998ef60ac463 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -720,6 +720,11 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvmppc_core_flush_memslot(kvm, slot); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d9e6bf3d54f0..57c6838dff37 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3035,15 +3035,19 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return rc; } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) + return -EINVAL; + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; struct sie_page *sie_page; - int rc = -EINVAL; - - if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) - goto out; + int rc; rc = -ENOMEM; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8188d6cac588..661e3c40529f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9170,17 +9170,21 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); + + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; int r; - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vcpu) return ERR_PTR(-ENOMEM); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f7c8e2f378d..59ac53423361 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -874,6 +874,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c 
b/virt/kvm/arm/arm.c index adae134cec59..af3ce2bb370d 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -279,21 +279,22 @@ void kvm_arch_free_vm(struct kvm *kvm) vfree(kvm); } +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) + return -EBUSY; + + if (id >= kvm->arch.max_vcpus) + return -EINVAL; + + return 0; +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err; struct kvm_vcpu *vcpu; - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) { - err = -EBUSY; - goto out; - } - - if (id >= kvm->arch.max_vcpus) { - err = -EINVAL; - goto out; - } - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) { err = -ENOMEM; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 483c683408a2..7b52207f829c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2728,6 +2728,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) kvm->created_vcpus++; mutex_unlock(&kvm->lock); + r = kvm_arch_vcpu_precreate(kvm, id); + if (r) + goto vcpu_decrement; + vcpu = kvm_arch_vcpu_create(kvm, id); if (IS_ERR(vcpu)) { r = PTR_ERR(vcpu); -- cgit v1.2.3 From 321f8ee559d697d69efa81e8b6d4ea1e487c8bcc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:10 -0800 Subject: KVM: s390: Move guts of kvm_arch_vcpu_init() into kvm_arch_vcpu_create() Move all of kvm_arch_vcpu_init(), which is invoked at the very end of kvm_vcpu_init(), into kvm_arch_vcpu_create() in preparation of moving the call to kvm_vcpu_init(). Moving kvm_vcpu_init() is itself a preparatory step for moving allocation and initialization to common KVM code. No functional change inteded. Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 62 ++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 57c6838dff37..0049b621e56a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2705,34 +2705,6 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | - KVM_SYNC_GPRS | - KVM_SYNC_ACRS | - KVM_SYNC_CRS | - KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; - kvm_s390_set_prefix(vcpu, 0); - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; - if (test_kvm_facility(vcpu->kvm, 82)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; - if (test_kvm_facility(vcpu->kvm, 133)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; - if (test_kvm_facility(vcpu->kvm, 156)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; - /* fprs can be synchronized via vrs, even if the guest has no vx. With - * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. 
- */ - if (MACHINE_HAS_VX) - vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; - else - vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; - - if (kvm_is_ucontrol(vcpu->kvm)) - return __kvm_ucontrol_vcpu_init(vcpu); - return 0; } @@ -3077,11 +3049,45 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) goto out_free_sie_block; + + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | + KVM_SYNC_GPRS | + KVM_SYNC_ACRS | + KVM_SYNC_CRS | + KVM_SYNC_ARCH0 | + KVM_SYNC_PFAULT; + kvm_s390_set_prefix(vcpu, 0); + if (test_kvm_facility(vcpu->kvm, 64)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; + /* fprs can be synchronized via vrs, even if the guest has no vx. With + * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. + */ + if (MACHINE_HAS_VX) + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; + else + vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; + + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = __kvm_ucontrol_vcpu_init(vcpu); + if (rc) + goto out_uninit_vcpu; + } + VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); return vcpu; +out_uninit_vcpu: + kvm_vcpu_uninit(vcpu); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); out_free_cpu: -- cgit v1.2.3 From a2017f17fa175b812ce7de302316f67e8f2b7db0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:11 -0800 Subject: KVM: s390: Invoke kvm_vcpu_init() before allocating sie_page Now that s390's implementation of kvm_arch_vcpu_init() is empty, move the call to kvm_vcpu_init() above the allocation of the sie_page. This paves the way for moving vcpu allocation and initialization into common KVM code without any associated functional change. 
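Swapping the init ahead of the allocation also inverts the unwind labels, as the diff below shows: the last thing constructed becomes the first thing torn down. Schematically, with toy names:

	#include <errno.h>
	#include <stdlib.h>

	struct vcpu_demo { void *sie_page; int initialized; };

	static int common_init(struct vcpu_demo *v)    { v->initialized = 1; return 0; }
	static void common_uninit(struct vcpu_demo *v) { v->initialized = 0; }

	static int create(struct vcpu_demo *v)
	{
		int rc = common_init(v);	/* now runs first... */

		if (rc)
			return rc;

		v->sie_page = calloc(1, 4096);
		if (!v->sie_page) {
			common_uninit(v);	/* ...so uninit is the final unwind step */
			return -ENOMEM;
		}
		return 0;
	}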
Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0049b621e56a..1f8ba074cbd6 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3027,10 +3027,16 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (!vcpu) goto out; + rc = kvm_vcpu_init(vcpu, kvm, id); + if (rc) + goto out_free_cpu; + + rc = -ENOMEM; + BUILD_BUG_ON(sizeof(struct sie_page) != 4096); sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); if (!sie_page) - goto out_free_cpu; + goto out_uninit_vcpu; vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; @@ -3046,10 +3052,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); - rc = kvm_vcpu_init(vcpu, kvm, id); - if (rc) - goto out_free_sie_block; - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | @@ -3078,7 +3080,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, if (kvm_is_ucontrol(vcpu->kvm)) { rc = __kvm_ucontrol_vcpu_init(vcpu); if (rc) - goto out_uninit_vcpu; + goto out_free_sie_block; } VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, @@ -3086,10 +3088,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); return vcpu; -out_uninit_vcpu: - kvm_vcpu_uninit(vcpu); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); +out_uninit_vcpu: + kvm_vcpu_uninit(vcpu); out_free_cpu: kmem_cache_free(kvm_vcpu_cache, vcpu); out: -- cgit v1.2.3 From aaf532c57927982b2523994698f994255a722f5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:12 -0800 Subject: KVM: MIPS: Invoke kvm_vcpu_uninit() immediately prior to freeing vcpu Move the call to kvm_vcpu_uninit() in kvm_arch_vcpu_destroy() down a few lines so that it is invoked immediately prior to freeing the vCPU. This paves the way for moving the uninit and free sequence to common KVM code without an associated functional change. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2e14455aec4e..73360e021259 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -416,13 +416,13 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { hrtimer_cancel(&vcpu->arch.comparecount_timer); - kvm_vcpu_uninit(vcpu); - kvm_mips_dump_stats(vcpu); kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); + + kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } -- cgit v1.2.3 From d2423b347de46657c306d4c58f2e08feba4a19c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:13 -0800 Subject: KVM: x86: Invoke kvm_vcpu_uninit() immediately prior to freeing vcpu Move the call to kvm_vcpu_uninit() in kvm_arch_vcpu_destroy() down a few lines so that it is invoked immediately prior to freeing the vCPU. This paves the way for moving the uninit and free sequence to common KVM code without an associated functional change. 
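With MIPS and x86 now deferring the free to the very end, every architecture's destroy path finishes the same way, which is what lets the tail be hoisted into the common helper introduced two patches later. The end state, as established by the kvm_main.c hunks further down in this series, is simply:

	void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
	{
		kvm_arch_vcpu_destroy(vcpu);		/* arch-specific teardown */
		kvm_vcpu_uninit(vcpu);			/* common uninit (pid, vcpu->run page) */
		kmem_cache_free(kvm_vcpu_cache, vcpu);	/* finally, the struct itself */
	}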
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 661e3c40529f..335762a17180 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9251,11 +9251,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); - kvm_vcpu_uninit(vcpu); - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + + kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); } -- cgit v1.2.3 From 4543bdc08857e8026475a477e7ba88e461f38271 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:14 -0800 Subject: KVM: Introduce kvm_vcpu_destroy() Add kvm_vcpu_destroy() and wire up all architectures to call the common function instead of their arch specific implementation. The common destruction function will be used by future patches to move allocation and initialization of vCPUs to common KVM code, i.e. to free resources that are allocated by arch agnostic code. No functional change intended. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 2 +- arch/powerpc/kvm/powerpc.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 1 + virt/kvm/arm/arm.c | 2 +- virt/kvm/kvm_main.c | 6 ++++++ 7 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 73360e021259..8546bc6e09e7 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -156,7 +156,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); } mutex_lock(&kvm->lock); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 998ef60ac463..e3e2b88d3d8b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -475,7 +475,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) #endif kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1f8ba074cbd6..8543d338a06a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2541,7 +2541,7 @@ static void kvm_free_vcpus(struct kvm *kvm) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 335762a17180..42b9149f6b40 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9680,7 +9680,7 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 59ac53423361..432827ab7623 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -581,6 +581,7 @@ static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu 
*vcpu); +void kvm_vcpu_destroy(struct kvm_vcpu *vcpu); void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index af3ce2bb370d..0d8fb6973414 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -194,7 +194,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) for (i = 0; i < KVM_MAX_VCPUS; ++i) { if (kvm->vcpus[i]) { - kvm_arch_vcpu_destroy(kvm->vcpus[i]); + kvm_vcpu_destroy(kvm->vcpus[i]); kvm->vcpus[i] = NULL; } } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7b52207f829c..62ba25e44189 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -375,6 +375,12 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); +void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_destroy(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); + #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { -- cgit v1.2.3 From e529ef66e6b53b34f9b8caac55950c8a55c79dac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:15 -0800 Subject: KVM: Move vcpu alloc and init invocation to common code Now that all architectures tightly couple vcpu allocation/free with the mandatory calls to kvm_vcpu_{un}init(), move the sequences verbatim to common KVM code. Move both allocation and initialization in a single patch to eliminate thrash in arch specific code. The bisection benefits of moving the two pieces in separate patches are marginal at best, whereas the odds of introducing a transient arch specific bug are non-zero. Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 33 ++++++--------------------------- arch/powerpc/kvm/powerpc.c | 27 ++++----------------------- arch/s390/kvm/kvm-s390.c | 41 ++++++++++------------------------------- arch/x86/kvm/x86.c | 28 ++-------------------------- include/linux/kvm_host.h | 2 +- virt/kvm/arm/arm.c | 29 ++--------------------------- virt/kvm/kvm_main.c | 21 ++++++++++++++++++--- 7 files changed, 43 insertions(+), 138 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 8546bc6e09e7..92c9321b3f95 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -285,25 +285,14 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err, size; void *gebase, *p, *handler, *refill_start, *refill_end; int i; - struct kvm_vcpu *vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - - if (!vcpu) { - err = -ENOMEM; - goto out; - } - - err = kvm_vcpu_init(vcpu, kvm, id); - - if (err) - goto out_free_cpu; - - kvm_debug("kvm @ %p: create cpu %d at %p\n", kvm, id, vcpu); + kvm_debug("kvm @ %p: create cpu %d at %p\n", + vcpu->kvm, vcpu->vcpu_id, vcpu); /* * Allocate space for host mode exception handlers that handle * guest mode exits @@ -318,7 +307,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) if (!gebase) { err = -ENOMEM; - goto out_uninit_cpu; + goto out; } kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n", ALIGN(size, PAGE_SIZE), gebase); @@ -397,19 +386,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.last_sched_cpu = -1; vcpu->arch.last_exec_cpu = -1; - return vcpu; + return 0; out_free_gebase:
kfree(gebase); - -out_uninit_cpu: - kvm_vcpu_uninit(vcpu); - -out_free_cpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); - out: - return ERR_PTR(err); + return err; } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -421,9 +403,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e3e2b88d3d8b..fce1b4776e55 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -725,32 +725,17 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; int err; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - return ERR_PTR(-ENOMEM); - - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; - err = kvmppc_core_vcpu_create(vcpu); if (err) - goto uninit_vcpu; + return err; vcpu->arch.wqp = &vcpu->wq; - kvmppc_create_vcpu_debugfs(vcpu, id); - return vcpu; - -uninit_vcpu: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); - return ERR_PTR(err); + kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); + return 0; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -780,10 +765,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) } kvmppc_core_vcpu_free(vcpu); - - kvm_vcpu_uninit(vcpu); - - kmem_cache_free(kvm_vcpu_cache, vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8543d338a06a..9cba1e5d033b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2530,9 +2530,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); free_page((unsigned long)(vcpu->arch.sie_block)); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static void kvm_free_vcpus(struct kvm *kvm) @@ -3014,29 +3011,15 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; struct sie_page *sie_page; int rc; - rc = -ENOMEM; - - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; - - rc = kvm_vcpu_init(vcpu, kvm, id); - if (rc) - goto out_free_cpu; - - rc = -ENOMEM; - BUILD_BUG_ON(sizeof(struct sie_page) != 4096); sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); if (!sie_page) - goto out_uninit_vcpu; + return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; @@ -3045,9 +3028,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block->mso = 0; vcpu->arch.sie_block->msl = sclp.hamax; - vcpu->arch.sie_block->icpua = id; + vcpu->arch.sie_block->icpua = vcpu->vcpu_id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; + vcpu->arch.sie_block->gd = (u32)(u64)vcpu->kvm->arch.gisa_int.origin; if (vcpu->arch.sie_block->gd && sclp.has_gisaf) vcpu->arch.sie_block->gd |= GISA_FORMAT1; seqcount_init(&vcpu->arch.cputm_seqcount); @@ -3083,19 +3066,15 @@ struct kvm_vcpu 
*kvm_arch_vcpu_create(struct kvm *kvm, goto out_free_sie_block; } - VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, - vcpu->arch.sie_block); - trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + + return 0; - return vcpu; out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); -out_uninit_vcpu: - kvm_vcpu_uninit(vcpu); -out_free_cpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(rc); + return rc; } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42b9149f6b40..7bbde6f658bf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9179,30 +9179,9 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; - int r; - - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); - if (!vcpu) - return ERR_PTR(-ENOMEM); - - r = kvm_vcpu_init(vcpu, kvm, id); - if (r) - goto free_vcpu; - - r = kvm_x86_ops->vcpu_create(vcpu); - if (r) - goto uninit_vcpu; - return vcpu; - -uninit_vcpu: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); - return ERR_PTR(r); + return kvm_x86_ops->vcpu_create(vcpu); } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) @@ -9254,9 +9233,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 432827ab7623..405ea07068f1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -876,7 +876,7 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 0d8fb6973414..a7d661fc5683 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -290,32 +290,9 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - int err; - struct kvm_vcpu *vcpu; - - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) { - err = -ENOMEM; - goto out; - } - - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; - - err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); - if (err) - goto vcpu_uninit; - - return vcpu; -vcpu_uninit: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(err); + return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -330,8 +307,6 @@ void 
kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 62ba25e44189..c84df40518c4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -378,6 +378,9 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_arch_vcpu_destroy(vcpu); + + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); @@ -2738,12 +2741,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_decrement; - vcpu = kvm_arch_vcpu_create(kvm, id); - if (IS_ERR(vcpu)) { - r = PTR_ERR(vcpu); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) { + r = -ENOMEM; goto vcpu_decrement; } + r = kvm_vcpu_init(vcpu, kvm, id); + if (r) + goto vcpu_free; + + r = kvm_arch_vcpu_create(vcpu); + if (r) + goto vcpu_uninit; + preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); r = kvm_arch_vcpu_setup(vcpu); @@ -2787,6 +2798,10 @@ unlock_vcpu_destroy: debugfs_remove_recursive(vcpu->debugfs_dentry); vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); +vcpu_uninit: + kvm_vcpu_uninit(vcpu); +vcpu_free: + kmem_cache_free(kvm_vcpu_cache, vcpu); vcpu_decrement: mutex_lock(&kvm->lock); kvm->created_vcpus--; -- cgit v1.2.3 From 5f73db112e597b30efb7f81ab5fee87a9febad3e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:18 -0800 Subject: KVM: x86: Move guts of kvm_arch_vcpu_setup() into kvm_arch_vcpu_create() Fold setup() into create() now that the two are called back-to-back by common KVM code. This paves the way for removing kvm_arch_vcpu_setup(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bbde6f658bf..e4a446990306 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9181,11 +9181,12 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vcpu_create(vcpu); -} + int ret; + + ret = kvm_x86_ops->vcpu_create(vcpu); + if (ret) + return ret; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); @@ -9196,6 +9197,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct msr_data msr; -- cgit v1.2.3 From 5259878432098ffd26cef7294b0a85ab5cfaf556 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:19 -0800 Subject: KVM: MIPS: Move .vcpu_setup() call to kvm_arch_vcpu_create() Fold setup() into create() now that the two are called back-to-back by common KVM code. This paves the way for removing kvm_arch_vcpu_setup(). Note, there is no unwind function associated with kvm_arch_vcpu_setup(), i.e. no teardown path that also needs to be moved. No functional change intended. 
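The same fold recurs on each architecture and always has the same shape; schematically, with arch_create_guts(), arch_setup_guts() and arch_create_unwind() as illustrative stand-ins rather than real functions:

	int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
	{
		int err;

		err = arch_create_guts(vcpu);		/* pre-existing create work */
		if (err)
			return err;

		err = arch_setup_guts(vcpu);		/* body of the old kvm_arch_vcpu_setup() */
		if (err)
			arch_create_unwind(vcpu);	/* free whatever create allocated */
		return err;
	}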
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 92c9321b3f95..b3a4435af66b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -386,8 +386,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.last_sched_cpu = -1; vcpu->arch.last_exec_cpu = -1; + /* Initial guest state */ + err = kvm_mips_callbacks->vcpu_setup(vcpu); + if (err) + goto out_free_commpage; + return 0; +out_free_commpage: + kfree(vcpu->arch.kseg0_commpage); out_free_gebase: kfree(gebase); out: @@ -1237,10 +1244,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } -/* Initial guest state */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - return kvm_mips_callbacks->vcpu_setup(vcpu); + return 0; } static void kvm_mips_set_c0_status(void) -- cgit v1.2.3 From ff72bb55cbfd060172cfbafafe4838ce92ab080f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:20 -0800 Subject: KVM: s390: Manually invoke vcpu setup during kvm_arch_vcpu_create() Rename kvm_arch_vcpu_setup() to kvm_s390_vcpu_setup() and manually call the new function during kvm_arch_vcpu_create(). Define an empty kvm_arch_vcpu_setup() as it's still required for compilation. This is effectively a nop as kvm_arch_vcpu_create() and kvm_arch_vcpu_setup() are called back-to-back by common KVM code. Obsoleting kvm_arch_vcpu_setup() paves the way for its removal. Note, gmap_remove() is now called if setup fails, as s390 was previously freeing it via kvm_arch_vcpu_destroy(), which is called by common KVM code if kvm_arch_vcpu_setup() fails. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9cba1e5d033b..dca3d6aac2bb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2932,6 +2932,11 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; @@ -3070,8 +3075,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + rc = kvm_s390_vcpu_setup(vcpu); + if (rc) + goto out_ucontrol_uninit; return 0; +out_ucontrol_uninit: + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_remove(vcpu->arch.gmap); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); return rc; -- cgit v1.2.3 From b3d42c9862e0e2a2d95839d91120ddf8dd8a8af6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:21 -0800 Subject: KVM: PPC: BookE: Setup vcpu during kvmppc_core_vcpu_create() Fold setup() into create() now that the two are called back-to-back by common KVM code. This paves the way for removing kvm_arch_vcpu_setup(). Note, BookE directly implements kvm_arch_vcpu_setup() and PPC's common kvm_arch_vcpu_create() is responsible for its own cleanup, thus the only cleanup required when directly invoking kvmppc_core_vcpu_setup() is to call .vcpu_free(), which is the BookE specific portion of PPC's kvm_arch_vcpu_destroy() by way of kvmppc_core_vcpu_free(). No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/booke.c | 60 ++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d41765157f0e..e291c62187fe 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1377,34 +1377,9 @@ static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) update_timer_ints(vcpu); } -/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - int i; - int r; - - vcpu->arch.regs.nip = 0; - vcpu->arch.shared->pir = vcpu->vcpu_id; - kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ - kvmppc_set_msr(vcpu, 0); - -#ifndef CONFIG_KVM_BOOKE_HV - vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; - vcpu->arch.shadow_pid = 1; - vcpu->arch.shared->msr = 0; -#endif - - /* Eye-catching numbers so we know if the guest takes an interrupt - * before it's programmed its own IVPR/IVORs. */ - vcpu->arch.ivpr = 0x55550000; - for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) - vcpu->arch.ivor[i] = 0x7700 | i * 4; - - kvmppc_init_timing_stats(vcpu); - - r = kvmppc_core_vcpu_setup(vcpu); - kvmppc_sanity_check(vcpu); - return r; + return 0; } int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) @@ -2116,7 +2091,38 @@ int kvmppc_core_init_vm(struct kvm *kvm) int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); + int i; + int r; + + r = vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); + if (r) + return r; + + /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ + vcpu->arch.regs.nip = 0; + vcpu->arch.shared->pir = vcpu->vcpu_id; + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ + kvmppc_set_msr(vcpu, 0); + +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu->arch.shadow_pid = 1; + vcpu->arch.shared->msr = 0; +#endif + + /* Eye-catching numbers so we know if the guest takes an interrupt + * before it's programmed its own IVPR/IVORs. */ + vcpu->arch.ivpr = 0x55550000; + for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) + vcpu->arch.ivor[i] = 0x7700 | i * 4; + + kvmppc_init_timing_stats(vcpu); + + r = kvmppc_core_vcpu_setup(vcpu); + if (r) + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); + kvmppc_sanity_check(vcpu); + return r; } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From afede96df55e9cba948c8cc8a682e962244285b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:22 -0800 Subject: KVM: Drop kvm_arch_vcpu_setup() Remove kvm_arch_vcpu_setup() now that all arch specific implementations are nops. 
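After this patch the common creation path reads straight through, each unwind label undoing exactly one step. Condensed from the kvm_main.c hunks in this series (a sketch; the locking around created_vcpus is elided):

	r = kvm_arch_vcpu_precreate(kvm, id);	/* cheap arch-specific sanity checks */
	if (r)
		goto vcpu_decrement;

	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
	if (!vcpu) {
		r = -ENOMEM;
		goto vcpu_decrement;
	}

	r = kvm_vcpu_init(vcpu, kvm, id);	/* common state, vcpu->run page */
	if (r)
		goto vcpu_free;

	r = kvm_arch_vcpu_create(vcpu);		/* all arch init, setup now folded in */
	if (r)
		goto vcpu_uninit;

	kvm_create_vcpu_debugfs(vcpu);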
Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm/kvm/guest.c | 5 ----- arch/arm64/kvm/guest.c | 5 ----- arch/mips/kvm/mips.c | 5 ----- arch/powerpc/kvm/book3s.c | 5 ----- arch/powerpc/kvm/booke.c | 5 ----- arch/s390/kvm/kvm-s390.c | 5 ----- arch/x86/kvm/x86.c | 5 ----- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 5 ----- 9 files changed, 41 deletions(-) (limited to 'arch') diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 0e6f23504c26..9f7ae0d8690f 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -34,11 +34,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static u64 core_reg_offset_from_id(u64 id) { return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2fff06114a8f..2bd92301d32f 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -47,11 +47,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static bool core_reg_offset_is_vreg(u64 off) { return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b3a4435af66b..06366e2415a6 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1244,11 +1244,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void kvm_mips_set_c0_status(void) { u32 status = read_c0_status(); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 3f7adcb0ff63..d07a8e12fa15 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -471,11 +471,6 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvmppc_load_last_inst); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) { return 0; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index e291c62187fe..9cb8257b4118 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1377,11 +1377,6 @@ static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) update_timer_ints(vcpu); } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) { /* setup watchdog timer once */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index dca3d6aac2bb..a1bb47c7ba1e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2931,11 +2931,6 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4a446990306..827d5fcba7a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9197,11 +9197,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { struct msr_data msr; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 521f17cd2b26..87ca40f62b06 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -873,7 +873,6 @@ void 
kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id); int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1ddb6d4cfbfd..8e9d24442d20 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2753,10 +2753,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_uninit; - r = kvm_arch_vcpu_setup(vcpu); - if (r) - goto vcpu_destroy; - kvm_create_vcpu_debugfs(vcpu); mutex_lock(&kvm->lock); @@ -2792,7 +2788,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) unlock_vcpu_destroy: mutex_unlock(&kvm->lock); debugfs_remove_recursive(vcpu->debugfs_dentry); -vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_uninit: kvm_vcpu_uninit(vcpu); -- cgit v1.2.3 From 95a0d01eef7a1b97358c25d335c4a28f91345cf9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:23 -0800 Subject: KVM: x86: Move all vcpu init code into kvm_arch_vcpu_create() Fold init() into create() now that the two are called back-to-back by common KVM code (kvm_vcpu_init() calls kvm_arch_vcpu_init() as its last action, and kvm_vm_ioctl_create_vcpu() calls kvm_arch_vcpu_create() immediately thereafter). This paves the way for removing kvm_arch_vcpu_init() entirely. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 198 +++++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 98 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 827d5fcba7a5..4469617adfd0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9181,11 +9181,78 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { - int ret; + struct page *page; + int r; - ret = kvm_x86_ops->vcpu_create(vcpu); - if (ret) - return ret; + vcpu->arch.emulate_ctxt.ops = &emulate_ops; + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + + kvm_set_tsc_khz(vcpu, max_tsc_khz); + + r = kvm_mmu_create(vcpu); + if (r < 0) + return r; + + if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + if (r < 0) + goto fail_mmu_destroy; + } else + static_key_slow_inc(&kvm_no_apic_vcpu); + + r = -ENOMEM; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto fail_free_lapic; + vcpu->arch.pio_data = page_address(page); + + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks) + goto fail_free_pio_data; + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) + goto fail_free_mce_banks; + + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { + pr_err("kvm: failed to allocate userspace's fpu\n"); + goto free_wbinvd_dirty_mask; + } + + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); + goto 
free_user_fpu; + } + fx_init(vcpu); + + vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + + kvm_async_pf_hash_reset(vcpu); + kvm_pmu_init(vcpu); + + vcpu->arch.pending_external_vector = -1; + vcpu->arch.preempted_in_kernel = false; + + kvm_hv_vcpu_init(vcpu); + + r = kvm_x86_ops->vcpu_create(vcpu); + if (r) + goto free_guest_fpu; vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; @@ -9195,6 +9262,22 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_init_mmu(vcpu, false); vcpu_put(vcpu); return 0; + +free_guest_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); +free_user_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); +fail_free_lapic: + kvm_free_lapic(vcpu); +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); + return r; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -9227,6 +9310,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + int idx; + kvmclock_reset(vcpu); kvm_x86_ops->vcpu_free(vcpu); @@ -9234,6 +9319,17 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + + kvm_hv_vcpu_uninit(vcpu); + kvm_pmu_destroy(vcpu); + kfree(vcpu->arch.mce_banks); + kvm_free_lapic(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_mmu_destroy(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + free_page((unsigned long)vcpu->arch.pio_data); + if (!lapic_in_kernel(vcpu)) + static_key_slow_dec(&kvm_no_apic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9481,106 +9577,12 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - struct page *page; - int r; - - vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - - kvm_set_tsc_khz(vcpu, max_tsc_khz); - - r = kvm_mmu_create(vcpu); - if (r < 0) - return r; - - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); - if (r < 0) - goto fail_mmu_destroy; - } else - static_key_slow_inc(&kvm_no_apic_vcpu); - - r = -ENOMEM; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto fail_free_lapic; - vcpu->arch.pio_data = page_address(page); - - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) - goto fail_free_pio_data; - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, - GFP_KERNEL_ACCOUNT)) - goto fail_free_mce_banks; - - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); - goto free_wbinvd_dirty_mask; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto 
free_user_fpu; - } - fx_init(vcpu); - - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - - kvm_async_pf_hash_reset(vcpu); - kvm_pmu_init(vcpu); - - vcpu->arch.pending_external_vector = -1; - vcpu->arch.preempted_in_kernel = false; - - kvm_hv_vcpu_init(vcpu); - return 0; - -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); -free_wbinvd_dirty_mask: - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); - return r; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - int idx; - kvm_hv_vcpu_uninit(vcpu); - kvm_pmu_destroy(vcpu); - kfree(vcpu->arch.mce_banks); - kvm_free_lapic(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_mmu_destroy(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - free_page((unsigned long)vcpu->arch.pio_data); - if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); } void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) -- cgit v1.2.3 From d11dfed5d700b8973d5742300e04b2aaa9d11217 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:24 -0800 Subject: KVM: MIPS: Move all vcpu init code into kvm_arch_vcpu_create() Fold init() into create() now that the two are called back-to-back by common KVM code (kvm_vcpu_init() calls kvm_arch_vcpu_init() as its last action, and kvm_vm_ioctl_create_vcpu() calls kvm_arch_vcpu_create() immediately thereafter). Rinse and repeat for kvm_arch_vcpu_uninit() and kvm_arch_vcpu_destroy(). This paves the way for removing kvm_arch_vcpu_{un}init() entirely. No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 06366e2415a6..879a7cbd5b54 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -294,6 +294,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_debug("kvm @ %p: create cpu %d at %p\n", vcpu->kvm, vcpu->vcpu_id, vcpu); + err = kvm_mips_callbacks->vcpu_init(vcpu); + if (err) + return err; + + hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; + /* * Allocate space for host mode exception handlers that handle * guest mode exits @@ -307,7 +315,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (!gebase) { err = -ENOMEM; - goto out; + goto out_uninit_vcpu; } kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n", ALIGN(size, PAGE_SIZE), gebase); @@ -397,7 +405,8 @@ out_free_commpage: kfree(vcpu->arch.kseg0_commpage); out_free_gebase: kfree(gebase); -out: +out_uninit_vcpu: + kvm_mips_callbacks->vcpu_uninit(vcpu); return err; } @@ -410,6 +419,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); + + kvm_mips_callbacks->vcpu_uninit(vcpu); } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, @@ -1221,21 +1232,12 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - int err; - - err = kvm_mips_callbacks->vcpu_init(vcpu); - if (err) - return err; - - hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; return 0; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - kvm_mips_callbacks->vcpu_uninit(vcpu); + } int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 74ce2e60d4874fc2464e321af1397c6fae984ec9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:26 -0800 Subject: KVM: PPC: Move all vcpu init code into kvm_arch_vcpu_create() Fold init() into create() now that the two are called back-to-back by common KVM code (kvm_vcpu_init() calls kvm_arch_vcpu_init() as its last action, and kvm_vm_ioctl_create_vcpu() calls kvm_arch_vcpu_create() immediately thereafter). Rinse and repeat for kvm_arch_vcpu_uninit() and kvm_arch_vcpu_destroy(). This paves the way for removing kvm_arch_vcpu_{un}init() entirely. Note, calling kvmppc_mmu_destroy() if kvmppc_core_vcpu_create() fails may or may not be necessary. Move it along with the more obvious call to kvmppc_subarch_vcpu_uninit() so as not to inadvertently introduce a functional change and/or bug. No functional change intended.
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 56 ++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index fce1b4776e55..4fbf8690b8c5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -725,17 +725,43 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) return 0; } +static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); + kvmppc_decrementer_func(vcpu); + + return HRTIMER_NORESTART; +} + int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err; - err = kvmppc_core_vcpu_create(vcpu); + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; + vcpu->arch.dec_expires = get_tb(); + +#ifdef CONFIG_KVM_EXIT_TIMING + mutex_init(&vcpu->arch.exit_timing_lock); +#endif + err = kvmppc_subarch_vcpu_init(vcpu); if (err) return err; + err = kvmppc_core_vcpu_create(vcpu); + if (err) + goto out_vcpu_uninit; + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); return 0; + +out_vcpu_uninit: + kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); + return err; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -765,6 +791,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) } kvmppc_core_vcpu_free(vcpu); + + kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) @@ -772,35 +801,14 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); - kvmppc_decrementer_func(vcpu); - - return HRTIMER_NORESTART; -} - int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - int ret; - - hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; - vcpu->arch.dec_expires = get_tb(); - -#ifdef CONFIG_KVM_EXIT_TIMING - mutex_init(&vcpu->arch.exit_timing_lock); -#endif - ret = kvmppc_subarch_vcpu_init(vcpu); - return ret; + return 0; } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { - kvmppc_mmu_destroy(vcpu); - kvmppc_subarch_vcpu_uninit(vcpu); + } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -- cgit v1.2.3 From 19bcc89eb8a9fa1d4be4bff5b5e7917cb8bbc1f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:27 -0800 Subject: KVM: arm64: Free sve_state via arm specific hook Add an arm specific hook to free the arm64-only sve_state. Doing so eliminates the last functional code from kvm_arch_vcpu_uninit() across all architectures and paves the way for removing kvm_arch_vcpu_init() and kvm_arch_vcpu_uninit() entirely. 
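This is the usual stub/real split: 32-bit arm gets an inline no-op so the common arm code can call the hook unconditionally, while arm64 supplies the actual free. Both definitions appear verbatim in the diff below:

	/* arch/arm: no SVE, so the hook is an empty inline */
	static inline void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {}

	/* arch/arm64: release the SVE register storage */
	void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
	{
		kfree(vcpu->arch.sve_state);
	}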
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/reset.c | 5 +++++ virt/kvm/arm/arm.c | 2 ++ 4 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 556cd818eccf..de81dd897a30 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -366,6 +366,7 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +static inline void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c61260cf63c5..6402b2de1844 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,6 +53,7 @@ int kvm_arm_init_sve(void); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); +void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f4a8ae918827..ff3512a0ca97 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -205,6 +205,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) } void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + +} + +void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { kfree(vcpu->arch.sve_state); } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 94616725b97e..937b4c7fb5be 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -326,6 +326,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); + + kvm_arm_vcpu_destroy(vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From ddd259c9aaba08244dba8877687ee856f79c4f45 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 18 Dec 2019 13:55:28 -0800 Subject: KVM: Drop kvm_arch_vcpu_init() and kvm_arch_vcpu_uninit() Remove kvm_arch_vcpu_init() and kvm_arch_vcpu_uninit() now that all arch specific implementations are nops. 
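With the arch callbacks gone, common init has a single failure mode left, the vcpu->run page allocation. The function reduces to roughly the following (condensed from the kvm_main.c hunk below; several field initializations are elided):

	static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
	{
		struct page *page;

		mutex_init(&vcpu->mutex);
		vcpu->cpu = -1;				/* further field init elided */

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;			/* the only failure left */
		vcpu->run = page_address(page);

		kvm_vcpu_set_in_spin_loop(vcpu, false);
		preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
		return 0;
	}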
Acked-by: Christoffer Dall Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 1 - arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/reset.c | 5 ----- arch/mips/kvm/mips.c | 10 ---------- arch/powerpc/kvm/powerpc.c | 10 ---------- arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kvm/kvm-s390.c | 5 ----- arch/x86/kvm/x86.c | 10 ---------- include/linux/kvm_host.h | 3 --- virt/kvm/arm/arm.c | 5 ----- virt/kvm/kvm_main.c | 16 ++-------------- 11 files changed, 2 insertions(+), 65 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de81dd897a30..e26cad6d11b3 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -363,7 +363,6 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); static inline bool kvm_arch_requires_vhe(void) { return false; } static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6402b2de1844..8ab62944e514 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -54,7 +54,6 @@ int kvm_arm_init_sve(void); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index ff3512a0ca97..30b7ea680f66 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -204,11 +204,6 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) return true; } -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) { kfree(vcpu->arch.sve_state); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 879a7cbd5b54..2606f3f02b54 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1230,16 +1230,6 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) return kvm_mips_count_timeout(vcpu); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4fbf8690b8c5..1af96fb5dc6f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -801,16 +801,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_BOOKE diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 02f4c21c57f6..11ecc4071a29 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -914,7 +914,6 @@ extern int 
kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc); static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a1bb47c7ba1e..8646c99217f2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2700,11 +2700,6 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4469617adfd0..985066e1bda5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9575,16 +9575,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - -} - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87ca40f62b06..a654cf6df078 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -864,9 +864,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run); int kvm_arch_init(void *opaque); void kvm_arch_exit(void); -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu); - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu); void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 937b4c7fb5be..1cfc108eca1e 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -360,11 +360,6 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) preempt_enable(); } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { int *last_ran; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8e9d24442d20..6b496038cd7f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -324,7 +324,6 @@ void kvm_reload_remote_mmus(struct kvm *kvm) static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { struct page *page; - int r; mutex_init(&vcpu->mutex); vcpu->cpu = -1; @@ -338,10 +337,8 @@ static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } + if (!page) + return -ENOMEM; vcpu->run = page_address(page); kvm_vcpu_set_in_spin_loop(vcpu, false); @@ -350,15 +347,7 @@ static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - r = kvm_arch_vcpu_init(vcpu); - if (r < 0) - goto fail_free_run; return 0; - -fail_free_run: - free_page((unsigned long)vcpu->run); -fail: - return r; } static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) @@ -369,7 +358,6 @@ static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) * descriptors are 
already gone. */ put_pid(rcu_dereference_protected(vcpu->pid, 1)); - kvm_arch_vcpu_uninit(vcpu); free_page((unsigned long)vcpu->run); } -- cgit v1.2.3 From a47970ed74a535b1accb4bc73643fd5a93993c3e Mon Sep 17 00:00:00 2001 From: John Allen Date: Thu, 19 Dec 2019 14:17:59 -0600 Subject: kvm/svm: PKU not currently supported The current SVM implementation does not support handling PKU. Guests running on a host with future AMD CPUs that support the feature will read garbage from the PKRU register and will hit segmentation faults on boot, as memory that should not be protected gets marked as protected. Ensure that cpuid from SVM does not advertise the feature. Signed-off-by: John Allen Cc: stable@vger.kernel.org Fixes: 0556cbdc2fbc ("x86/pkeys: Don't check if PKRU is zero before writing it") Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 4 +++- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx/capabilities.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 1 + 5 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fff9ed6956b5..49751cbd6e63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1157,6 +1157,7 @@ struct kvm_x86_ops { bool (*xsaves_supported)(void); bool (*umip_emulated)(void); bool (*pt_supported)(void); + bool (*pku_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 74a4d9b4e61f..b1c469446b07 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -353,6 +353,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; unsigned f_la57; + unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0; /* cpuid 7.0.ebx */ const u32 kvm_cpuid_7_0_ebx_x86_features = @@ -364,7 +365,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; @@ -393,6 +394,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* Set LA57 based on hardware capability. */ entry->ecx |= f_la57; entry->ecx |= f_umip; + entry->ecx |= f_pku; /* PKU is not yet implemented for shadow paging.
*/ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 83257a7a2e37..9dbb990c319a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6008,6 +6008,11 @@ static bool svm_has_wbinvd_exit(void) return true; } +static bool svm_pku_supported(void) +{ + return false; +} + #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -7351,6 +7356,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .xsaves_supported = svm_xsaves_supported, .umip_emulated = svm_umip_emulated, .pt_supported = svm_pt_supported, + .pku_supported = svm_pku_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 7aa69716d516..283bdb7071af 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -145,6 +145,11 @@ static inline bool vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } +static inline bool vmx_pku_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PKU); +} + static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2134726b0442..5415cd40678c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7849,6 +7849,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .xsaves_supported = vmx_xsaves_supported, .umip_emulated = vmx_umip_emulated, .pt_supported = vmx_pt_supported, + .pku_supported = vmx_pku_supported, .request_immediate_exit = vmx_request_immediate_exit, -- cgit v1.2.3 From 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:41 -0800 Subject: KVM: x86: Protect x86_decode_insn from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in x86_decode_insn(). kvm_emulate_instruction() (an ancestor of x86_decode_insn()) is an exported symbol, so KVM should treat it conservatively from a security perspective. Fixes: 045a282ca415 ("KVM: emulator: implement fninit, fnstsw, fnstcw") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e9833e345a5c..2d4faefe8dd4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5288,10 +5288,15 @@ done_prefixes: } break; case Escape: - if (ctxt->modrm > 0xbf) - opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; - else + if (ctxt->modrm > 0xbf) { + size_t size = ARRAY_SIZE(opcode.u.esc->high); + u32 index = array_index_nospec( + ctxt->modrm - 0xc0, size); + + opcode = opcode.u.esc->high[index]; + } else { opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; + } break; case InstrDual: if ((ctxt->modrm >> 6) == 3) -- cgit v1.2.3 From 8618793750071d66028584a83ed0b4fa7eb4f607 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:42 -0800 Subject: KVM: x86: Protect kvm_hv_msr_[get|set]_crash_data() from Spectre-v1/L1TF attacks This fixes Spectre-v1/L1TF vulnerabilities in kvm_hv_msr_get_crash_data() and kvm_hv_msr_set_crash_data(). These functions contain index computations that use the (attacker-controlled) MSR number. 
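For reference, all of the Spectre-v1/L1TF fixes in this series apply the same hardening pattern: bounds-check the untrusted index, then clamp it with array_index_nospec() so that even a mispredicted bounds check cannot form an out-of-bounds address for the dependent load. A minimal sketch of the pattern, with hypothetical demo_* names that do not come from any of these patches:

	#include <linux/kernel.h>
	#include <linux/nospec.h>

	static u64 demo_table[16];

	static u64 demo_read(u32 index)
	{
		if (index >= ARRAY_SIZE(demo_table))
			return 0;

		/*
		 * The branch above can be speculated past; array_index_nospec()
		 * forces index into [0, size) both architecturally and under
		 * misspeculation, so the load below can never pull
		 * out-of-bounds data into the cache.
		 */
		index = array_index_nospec(index, ARRAY_SIZE(demo_table));
		return demo_table[index];
	}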
Fixes: e7d9513b60e8 ("kvm/x86: added hyper-v crash msrs into kvm hyperv context") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b255b9e865e5..4df1c965bf1a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -810,11 +810,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 *pdata) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - *pdata = hv->hv_crash_param[index]; + *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } @@ -853,11 +854,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, u32 index, u64 data) { struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + size_t size = ARRAY_SIZE(hv->hv_crash_param); - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) + if (WARN_ON_ONCE(index >= size)) return -EINVAL; - hv->hv_crash_param[index] = data; + hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } -- cgit v1.2.3 From 14e32321f3606e4b0970200b6e5e47ee6f1e6410 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:43 -0800 Subject: KVM: x86: Refactor picdev_write() to prevent Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in picdev_write(). It replaces index computations based on the (attacker-controlled) port number with constants through a minor refactoring. Fixes: 85f455f7ddbe ("KVM: Add support for in-kernel PIC emulation") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8259.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8b38bb4868a6..629a09ca9860 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *s, switch (addr) { case 0x20: case 0x21: + pic_lock(s); + pic_ioport_write(&s->pics[0], addr, data); + pic_unlock(s); + break; case 0xa0: case 0xa1: pic_lock(s); - pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_ioport_write(&s->pics[1], addr, data); pic_unlock(s); break; case 0x4d0: -- cgit v1.2.3 From 8c86405f606ca8508b8d9280680166ca26723695 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:44 -0800 Subject: KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in ioapic_read_indirect(). This function contains index computations based on the (attacker-controlled) IOREGSEL register.
Fixes: a2c118bfab8b ("KVM: Fix bounds checking in ioapic indirect register reads (CVE-2013-1798)") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 7312aab33298..c5776febb517 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -36,6 +36,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/nospec.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/current.h> @@ -68,13 +69,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, default: { u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; - u64 redir_content; + u64 redir_content = ~0ULL; - if (redir_index < IOAPIC_NUM_PINS) - redir_content = - ioapic->redirtbl[redir_index].bits; - else - redir_content = ~0ULL; + if (redir_index < IOAPIC_NUM_PINS) { + u32 index = array_index_nospec( + redir_index, IOAPIC_NUM_PINS); + + redir_content = ioapic->redirtbl[index].bits; + } result = (ioapic->ioregsel & 0x1) ? (redir_content >> 32) & 0xffffffff : -- cgit v1.2.3 From 670564559ca35b439c8d8861fc399451ddf95137 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:45 -0800 Subject: KVM: x86: Protect ioapic_write_indirect() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in ioapic_write_indirect(). This function contains index computations based on the (attacker-controlled) IOREGSEL register. This patch depends on patch "KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks". Fixes: 70f93dae32ac ("KVM: Use temporary variable to shorten lines.") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index c5776febb517..26aa22cb9b29 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -297,6 +297,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (index >= IOAPIC_NUM_PINS) return; + index = array_index_nospec(index, IOAPIC_NUM_PINS); e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; /* Preserve read-only fields */ -- cgit v1.2.3 From 4bf79cb089f6b1c6c632492c0271054ce52ad766 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:46 -0800 Subject: KVM: x86: Protect kvm_lapic_reg_write() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in kvm_lapic_reg_write(). This function contains index computations based on the (attacker-controlled) MSR number.
Fixes: 0105d1a52640 ("KVM: x2apic interface to lapic") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 88c3c0c6d1e3..865edce27a6a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1960,15 +1960,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: + case APIC_LVTERR: { /* TODO: Check vector */ + size_t size; + u32 index; + if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + size = ARRAY_SIZE(apic_lvt_mask); + index = array_index_nospec( + (reg - APIC_LVTT) >> 4, size); + val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); - break; + } case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) -- cgit v1.2.3 From 25a5edea71b7c154b6a0b8cec14c711cafa31d26 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:47 -0800 Subject: KVM: x86: Protect MSR-based index computations in fixed_msr_to_seg_unit() from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in fixed_msr_to_seg_unit(). This function contains index computations based on the (attacker-controlled) MSR number. Fixes: de9aef5e1ad6 ("KVM: MTRR: introduce fixed_mtrr_segment table") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mtrr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 25ce3edd1872..7f0059aa30e1 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -192,11 +192,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) break; case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: *seg = 1; - *unit = msr - MSR_MTRRfix16K_80000; + *unit = array_index_nospec( + msr - MSR_MTRRfix16K_80000, + MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); break; case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: *seg = 2; - *unit = msr - MSR_MTRRfix4K_C0000; + *unit = array_index_nospec( + msr - MSR_MTRRfix4K_C0000, + MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); break; default: return false; -- cgit v1.2.3 From 13c5183a4e643cc2b03a22d0e582c8e17bb7457d Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:48 -0800 Subject: KVM: x86: Protect MSR-based index computations in pmu.h from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in the get_gp_pmc() and get_fixed_pmc() functions. They both contain index computations based on the (attacker-controlled) MSR number. 
Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7ebb62326c14..13332984b6d5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -2,6 +2,8 @@ #ifndef __KVM_X86_PMU_H #define __KVM_X86_PMU_H +#include <linux/nospec.h> + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -102,8 +104,12 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, u32 base) { - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_gp_counters); + + return &pmu->gp_counters[index]; + } return NULL; } @@ -113,8 +119,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) { int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) { + u32 index = array_index_nospec(msr - base, + pmu->nr_arch_fixed_counters); + + return &pmu->fixed_counters[index]; + } return NULL; } -- cgit v1.2.3 From 6ec4c5eee1750d5d17951c4e1960d953376a0dda Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:49 -0800 Subject: KVM: x86: Protect MSR-based index computations from Spectre-v1/L1TF attacks in x86.c This fixes a Spectre-v1/L1TF vulnerability in set_msr_mce() and get_msr_mce(). Both functions contain index computations based on the (attacker-controlled) MSR number.
Fixes: 890ca9aefa78 ("KVM: Add MCE support") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 985066e1bda5..913e55f6dca3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2545,7 +2545,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore @@ -2986,7 +2989,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; break; } -- cgit v1.2.3 From 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:50 -0800 Subject: KVM: x86: Refactor prefix decoding to prevent Spectre-v1/L1TF attacks This fixes Spectre-v1/L1TF vulnerabilities in vmx_read_guest_seg_selector(), vmx_read_guest_seg_base(), vmx_read_guest_seg_limit() and vmx_read_guest_seg_ar(). When invoked from emulation, these functions contain index computations based on the (attacker-influenced) segment value. Using constants prevents the attack. Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2d4faefe8dd4..20c0cbdff1be 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5195,16 +5195,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->ad_bytes = def_ad_bytes ^ 6; break; case 0x26: /* ES override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_ES; + break; case 0x2e: /* CS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_CS; + break; case 0x36: /* SS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_SS; + break; case 0x3e: /* DS override */ has_seg_override = true; - ctxt->seg_override = (ctxt->b >> 3) & 3; + ctxt->seg_override = VCPU_SREG_DS; break; case 0x64: /* FS override */ + has_seg_override = true; + ctxt->seg_override = VCPU_SREG_FS; + break; case 0x65: /* GS override */ has_seg_override = true; - ctxt->seg_override = ctxt->b & 7; + ctxt->seg_override = VCPU_SREG_GS; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) -- cgit v1.2.3 From c926f2f7230b1a29e31914b51db680f8cbf3103f Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:51 -0800 Subject: KVM: x86: Protect exit_reason from being used in Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in vmx_handle_exit(). While exit_reason is set by the hardware and therefore should not be attacker-influenced, an unknown exit_reason could potentially be used to perform such an attack. 
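The exit_reason case below extends the same clamp to a function-pointer table, where a speculative out-of-bounds read would supply an attacker-influenced call target rather than a data value. A condensed, hypothetical sketch of the dispatch shape the patch gives vmx_handle_exit(); the demo_* names are illustrative, and the real code reports KVM_EXIT_INTERNAL_ERROR on the unexpected path:

	static int demo_handle_hlt(struct kvm_vcpu *vcpu)
	{
		return 1;
	}

	static int (*const demo_handlers[])(struct kvm_vcpu *vcpu) = {
		demo_handle_hlt,	/* one entry per hardware exit reason */
	};

	static int demo_handle_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
	{
		if (exit_reason >= ARRAY_SIZE(demo_handlers))
			return -EINVAL;

		/* Clamp before indexing so speculation cannot fetch a
		 * bogus function pointer from beyond the table. */
		exit_reason = array_index_nospec(exit_reason,
						 ARRAY_SIZE(demo_handlers));
		if (!demo_handlers[exit_reason])
			return -EINVAL;

		return demo_handlers[exit_reason](vcpu);
	}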
Fixes: 55d2375e58a6 ("KVM: nVMX: Move nested code to dedicated files") Signed-off-by: Marios Pomonis Signed-off-by: Nick Finco Suggested-by: Sean Christopherson Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 57 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5415cd40678c..62fb639895c2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5913,34 +5913,41 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { kvm_skip_emulated_instruction(vcpu); return 1; - } else if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) { + } + + if (exit_reason >= kvm_vmx_max_exit_handlers) + goto unexpected_vmexit; #ifdef CONFIG_RETPOLINE - if (exit_reason == EXIT_REASON_MSR_WRITE) - return kvm_emulate_wrmsr(vcpu); - else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) - return handle_preemption_timer(vcpu); - else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) - return handle_interrupt_window(vcpu); - else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) - return handle_external_interrupt(vcpu); - else if (exit_reason == EXIT_REASON_HLT) - return kvm_emulate_halt(vcpu); - else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) - return handle_ept_misconfig(vcpu); + if (exit_reason == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW) + return handle_interrupt_window(vcpu); + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); #endif - return kvm_vmx_exit_handlers[exit_reason](vcpu); - } else { - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason); - dump_vmcs(); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = + + exit_reason = array_index_nospec(exit_reason, + kvm_vmx_max_exit_handlers); + if (!kvm_vmx_exit_handlers[exit_reason]) + goto unexpected_vmexit; + + return kvm_vmx_exit_handlers[exit_reason](vcpu); + +unexpected_vmexit: + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); + dump_vmcs(); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; - vcpu->run->internal.data[0] = exit_reason; - return 0; - } + vcpu->run->internal.ndata = 1; + vcpu->run->internal.data[0] = exit_reason; + return 0; } /* -- cgit v1.2.3 From ea740059ecb37807ba47b84b33d1447435a8d868 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:52 -0800 Subject: KVM: x86: Protect DR-based index computations from Spectre-v1/L1TF attacks This fixes a Spectre-v1/L1TF vulnerability in __kvm_set_dr() and kvm_get_dr(). Both kvm_get_dr() and kvm_set_dr() (a wrapper of __kvm_set_dr()) are exported symbols so KVM should treat them conservatively from a security perspective.
Fixes: 020df0794f57 ("KVM: move DR register access handling into generic code") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 913e55f6dca3..780224e76723 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1063,9 +1063,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - vcpu->arch.db[dr] = val; + vcpu->arch.db[array_index_nospec(dr, size)] = val; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; @@ -1102,9 +1104,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - *val = vcpu->arch.db[dr]; + *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: /* fall through */ -- cgit v1.2.3 From 66061740f1a487f4ed54fde75e724709f805da53 Mon Sep 17 00:00:00 2001 From: Marios Pomonis Date: Wed, 11 Dec 2019 12:47:53 -0800 Subject: KVM: x86: Protect pmu_intel.c from Spectre-v1/L1TF attacks This fixes Spectre-v1/L1TF vulnerabilities in intel_find_fixed_event() and intel_rdpmc_ecx_to_pmc(). kvm_rdpmc() (ancestor of intel_rdpmc_ecx_to_pmc()) and reprogram_fixed_counter() (ancestor of intel_find_fixed_event()) are exported symbols so KVM should treat them conservatively from a security perspective. Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") Signed-off-by: Nick Finco Signed-off-by: Marios Pomonis Reviewed-by: Andrew Honig Cc: stable@vger.kernel.org Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 7023138b1cb0..34a3a17bb6d7 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -86,10 +86,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu, static unsigned intel_find_fixed_event(int idx) { - if (idx >= ARRAY_SIZE(fixed_pmc_events)) + u32 event; + size_t size = ARRAY_SIZE(fixed_pmc_events); + + if (idx >= size) return PERF_COUNT_HW_MAX; - return intel_arch_events[fixed_pmc_events[idx]].event_type; + event = fixed_pmc_events[array_index_nospec(idx, size)]; + return intel_arch_events[event].event_type; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ @@ -130,16 +134,20 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); struct kvm_pmc *counters; + unsigned int num_counters; idx &= ~(3u << 30); - if (!fixed && idx >= pmu->nr_arch_gp_counters) - return NULL; - if (fixed && idx >= pmu->nr_arch_fixed_counters) + if (fixed) { + counters = pmu->fixed_counters; + num_counters = pmu->nr_arch_fixed_counters; + } else { + counters = pmu->gp_counters; + num_counters = pmu->nr_arch_gp_counters; + } + if (idx >= num_counters) return NULL; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; *mask &= pmu->counter_bitmask[fixed ?
KVM_PMC_FIXED : KVM_PMC_GP]; - - return &counters[idx]; + return &counters[array_index_nospec(idx, num_counters)]; } static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -- cgit v1.2.3 From 767b839afa5d62ba9cf859f4e90fef3d4a1780b5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 18 Jan 2020 10:41:55 +0800 Subject: KVM: x86: avoid clearing pending exception event twice The exception pending event is cleared by kvm_clear_exception_queue(). We shouldn't clear it again. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 780224e76723..2b26400a3410 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9355,7 +9355,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.nmi_injected = false; kvm_clear_interrupt_queue(vcpu); kvm_clear_exception_queue(vcpu); - vcpu->arch.exception.pending = false; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); -- cgit v1.2.3 From 3ce4dc17e0c1e7280d53abeb85ce851a88868c63 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 18 Jan 2020 10:50:37 +0800 Subject: KVM: apic: short-circuit kvm_apic_accept_pic_intr() when pic intr is accepted Short-circuit kvm_apic_accept_pic_intr() when pic intr is accepted, there is no need to proceed further. Also remove unnecessary var r. Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 865edce27a6a..286396c0aa7d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2375,14 +2375,13 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) { u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0); - int r = 0; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) - r = 1; + return 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - return r; + return 1; + return 0; } void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3911b65ee1d3e3fc5e3786b1f309dcad0e33f7fd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 18 Jan 2020 22:29:46 +0100 Subject: Revert "KVM: x86: Add a WARN on TIF_NEED_FPU_LOAD in kvm_load_guest_fpu()" This reverts commit 95145c25a78cc0a9d3cbc75708abde432310c5a1. The next few patches will fix the issue so the warning is not needed anymore; revert it separately to simplify application to stable kernels. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b26400a3410..603a1f778dbe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8563,13 +8563,6 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - /* - * Reloading userspace's FPU is handled by kvm_arch_vcpu_load(), both - * for direct calls from userspace (via vcpu_load()) and if this task - * is preempted (via kvm_sched_in()) between vcpu_load() and now. - */ - WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); - copy_fpregs_to_fpstate(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. 
*/ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, -- cgit v1.2.3 From c9aef3b85f425d1f6635382ec210ee5a7ef55d7d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:49 -0800 Subject: KVM: x86: Handle TIF_NEED_FPU_LOAD in kvm_{load,put}_guest_fpu() Handle TIF_NEED_FPU_LOAD similar to how fpu__copy() handles the flag when duplicating FPU state to a new task struct. TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM, be it voluntarily, e.g. if I/O is triggered during a KVM call to get_user_pages, or involuntarily, e.g. if softirq runs after an IRQ occurs. Therefore, KVM must account for TIF_NEED_FPU_LOAD whenever it is (potentially) accessing CPU FPU state. Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 603a1f778dbe..4b1c9d1c3786 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8558,12 +8558,26 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +static void kvm_save_current_fpu(struct fpu *fpu) +{ + /* + * If the target FPU state is not resident in the CPU registers, just + * memcpy() from current, else save CPU state directly to the target. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&fpu->state, ¤t->thread.fpu.state, + fpu_kernel_xstate_size); + else + copy_fpregs_to_fpstate(fpu); +} + /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(vcpu->arch.user_fpu); + kvm_save_current_fpu(vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8579,7 +8593,8 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); + kvm_save_current_fpu(vcpu->arch.guest_fpu); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); -- cgit v1.2.3 From a7baead7e312f5a05381d68585fb6dc68e19e90f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:50 -0800 Subject: KVM: x86: Ensure guest's FPU state is loaded when accessing for emulation Lock the FPU regs and reload the current thread's FPU state, which holds the guest's FPU state, to the CPU registers if necessary prior to accessing guest FPU state as part of emulation. kernel_fpu_begin() can be called from softirq context, therefore KVM must ensure softirqs are disabled (locking the FPU regs disables softirqs) when touching CPU FPU state. Note, for all intents and purposes this reverts commit 6ab0b9feb82a7 ("x86,kvm: remove KVM emulator get_fpu / put_fpu"), but at the time it was applied, removing get/put_fpu() was correct. The re-introduction of {get,put}_fpu() is necessitated by the deferring of FPU state load. 
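Concretely, every raw FPU instruction in the emulator ends up bracketed by the new helpers, so the registers are guaranteed to hold the current task's (i.e. the guest's) FPU state and a softirq's kernel_fpu_begin() cannot run in between. A sketch modeled on the patch's flush_pending_x87_faults() change below; asm_safe() is the emulator's existing exception-safe asm wrapper, and the demo_* name is illustrative:

	static int demo_emulate_fwait(struct x86_emulate_ctxt *ctxt)
	{
		int rc;

		/* fpregs_lock() plus switch_fpu_return() if TIF_NEED_FPU_LOAD */
		emulator_get_fpu();
		rc = asm_safe("fwait");
		/* fpregs_unlock(): re-enables preemption and softirqs */
		emulator_put_fpu();

		return rc;
	}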
Fixes: 5f409e20b7945 ("x86/fpu: Defer FPU state load until return to userspace") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 20c0cbdff1be..792ae9588017 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -22,6 +22,7 @@ #include "kvm_cache_regs.h" #include <asm/kvm_emulate.h> #include <linux/stringify.h> +#include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/nospec-branch.h> @@ -1075,8 +1076,23 @@ static void fetch_register_operand(struct operand *op) } } +static void emulator_get_fpu(void) +{ + fpregs_lock(); + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); +} + +static void emulator_put_fpu(void) +{ + fpregs_unlock(); +} + static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; @@ -1098,11 +1114,13 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) #endif default: BUG(); } + emulator_put_fpu(); } static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; @@ -1124,10 +1142,12 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, #endif default: BUG(); } + emulator_put_fpu(); } static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; @@ -1139,10 +1159,12 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; default: BUG(); } + emulator_put_fpu(); } static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) { + emulator_get_fpu(); switch (reg) { case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; @@ -1154,6 +1176,7 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; default: BUG(); } + emulator_put_fpu(); } static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1161,7 +1184,9 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fninit"); + emulator_put_fpu(); return X86EMUL_CONTINUE; } @@ -1172,7 +1197,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fnstcw %0": "+m"(fcw)); + emulator_put_fpu(); ctxt->dst.val = fcw; @@ -1186,7 +1213,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); + emulator_get_fpu(); asm volatile("fnstsw %0": "+m"(fsw)); + emulator_put_fpu(); ctxt->dst.val = fsw; @@ -4077,8 +4106,12 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + emulator_get_fpu(); + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + emulator_put_fpu(); + if (rc != X86EMUL_CONTINUE) return rc; @@ -4121,6 +4154,8
@@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + emulator_get_fpu(); + if (size < __fxstate_size(16)) { rc = fxregs_fixup(&fx_state, size); if (rc != X86EMUL_CONTINUE) @@ -4136,6 +4171,8 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: + emulator_put_fpu(); + return rc; } @@ -5450,7 +5487,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; + emulator_get_fpu(); rc = asm_safe("fwait"); + emulator_put_fpu(); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); -- cgit v1.2.3 From 2620fe268e80d667a94553cd37a94ccaa2cb8c83 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:51 -0800 Subject: KVM: x86: Revert "KVM: X86: Fix fpu state crash in kvm guest" Reload the current thread's FPU state, which contains the guest's FPU state, to the CPU registers if necessary during vcpu_enter_guest(). TIF_NEED_FPU_LOAD can be set any time control is transferred out of KVM, e.g. if I/O is triggered during a KVM call to get_user_pages() or if a softirq occurs while KVM is scheduled in. Moving the handling of TIF_NEED_FPU_LOAD from vcpu_enter_guest() to kvm_arch_vcpu_load(), effectively kvm_sched_in(), papered over a bug where kvm_put_guest_fpu() failed to account for TIF_NEED_FPU_LOAD. The easiest way to trigger the kvm_put_guest_fpu() bug was to run with involuntary preemption enabled; handling TIF_NEED_FPU_LOAD during kvm_sched_in() thus made the bug go away. But, removing the handling in vcpu_enter_guest() exposed KVM to the rare case of a softirq triggering kernel_fpu_begin() between vcpu_load() and vcpu_enter_guest(). Now that kvm_{load,put}_guest_fpu() correctly handle TIF_NEED_FPU_LOAD, revert the commit to both restore the vcpu_enter_guest() behavior and eliminate the superfluous switch_fpu_return() in kvm_arch_vcpu_load(). Note, leaving the handling in kvm_arch_vcpu_load() isn't wrong per se, but it is unnecessary, and most critically, makes it extremely difficult to find bugs such as the kvm_put_guest_fpu() issue due to shrinking the window where a softirq can corrupt state. A sample trace triggered by warning if TIF_NEED_FPU_LOAD is set while vcpu state is loaded: gcmaes_crypt_by_sg.constprop.12+0x26e/0x660 ? 0xffffffffc024547d ? __qdisc_run+0x83/0x510 ? __dev_queue_xmit+0x45e/0x990 ? ip_finish_output2+0x1a8/0x570 ? fib4_rule_action+0x61/0x70 ? fib4_rule_action+0x70/0x70 ? fib_rules_lookup+0x13f/0x1c0 ? helper_rfc4106_decrypt+0x82/0xa0 ? crypto_aead_decrypt+0x40/0x70 ? crypto_aead_decrypt+0x40/0x70 ? crypto_aead_decrypt+0x40/0x70 ? esp_output_tail+0x8f4/0xa5a [esp4] ? skb_ext_add+0xd3/0x170 ? xfrm_input+0x7a6/0x12c0 ? xfrm4_rcv_encap+0xae/0xd0 ? xfrm4_transport_finish+0x200/0x200 ? udp_queue_rcv_one_skb+0x1ba/0x460 ? udp_unicast_rcv_skb.isra.63+0x72/0x90 ? __udp4_lib_rcv+0x51b/0xb00 ? ip_protocol_deliver_rcu+0xd2/0x1c0 ? ip_local_deliver_finish+0x44/0x50 ? ip_local_deliver+0xe0/0xf0 ? ip_protocol_deliver_rcu+0x1c0/0x1c0 ? ip_rcv+0xbc/0xd0 ? ip_rcv_finish_core.isra.19+0x380/0x380 ? __netif_receive_skb_one_core+0x7e/0x90 ? netif_receive_skb_internal+0x3d/0xb0 ? napi_gro_receive+0xed/0x150 ? 0xffffffffc0243c77 ? net_rx_action+0x149/0x3b0 ? __do_softirq+0xe4/0x2f8 ? handle_irq_event_percpu+0x6a/0x80 ? irq_exit+0xe6/0xf0 ? do_IRQ+0x7f/0xd0 ? common_interrupt+0xf/0xf ? irq_entries_start+0x20/0x660 ? vmx_get_interrupt_shadow+0x2f0/0x710 [kvm_intel] ? kvm_set_msr_common+0xfc7/0x2380 [kvm] ?
recalibrate_cpu_khz+0x10/0x10 ? ktime_get+0x3a/0xa0 ? kvm_arch_vcpu_ioctl_run+0x107/0x560 [kvm] ? kvm_init+0x6bf/0xd00 [kvm] ? __seccomp_filter+0x7a/0x680 ? do_vfs_ioctl+0xa4/0x630 ? security_file_ioctl+0x32/0x50 ? ksys_ioctl+0x60/0x90 ? __x64_sys_ioctl+0x16/0x20 ? do_syscall_64+0x5f/0x1a0 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace 9564a1ccad733a90 ]--- This reverts commit e751732486eb3f159089a64d1901992b1357e7cc. Fixes: e751732486eb3 ("KVM: X86: Fix fpu state crash in kvm guest") Reported-by: Derek Yerger Reported-by: kernel@najdan.com Cc: Wanpeng Li Cc: Thomas Lambertz Cc: Rik van Riel Cc: Sebastian Andrzej Siewior Cc: Borislav Petkov Cc: Dave Hansen Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b1c9d1c3786..ec8f05defd54 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3528,10 +3528,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); - /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@ -8285,8 +8281,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); guest_enter_irqoff(); - /* The preempt notifier should have taken care of the FPU already. */ - WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); -- cgit v1.2.3 From c0a21c3f9d9b21a4bd67a46d96eaeb372b0daf20 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2020 11:30:52 -0800 Subject: KVM: x86: Remove unused ctxt param from emulator's FPU accessors Remove an unused struct x86_emulate_ctxt * param from low level helpers used to access guest FPU state. The unused param was left behind by commit 6ab0b9feb82a ("x86,kvm: remove KVM emulator get_fpu / put_fpu"). No functional change intended. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 792ae9588017..c7a0da45f60a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1090,7 +1090,7 @@ static void emulator_put_fpu(void) fpregs_unlock(); } -static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) +static void read_sse_reg(sse128_t *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1117,8 +1117,7 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) emulator_put_fpu(); } -static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, - int reg) +static void write_sse_reg(sse128_t *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1145,7 +1144,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, emulator_put_fpu(); } -static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +static void read_mmx_reg(u64 *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1162,7 +1161,7 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) emulator_put_fpu(); } -static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +static void write_mmx_reg(u64 *data, int reg) { emulator_get_fpu(); switch (reg) { @@ -1234,7 +1233,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; - read_sse_reg(ctxt, &op->vec_val, reg); + read_sse_reg(&op->vec_val, reg); return; } if (ctxt->d & Mmx) { @@ -1285,7 +1284,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = ctxt->modrm_rm; - read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); + read_sse_reg(&op->vec_val, ctxt->modrm_rm); return rc; } if (ctxt->d & Mmx) { @@ -1862,10 +1861,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->bytes * op->count); break; case OP_XMM: - write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); + write_sse_reg(&op->vec_val, op->addr.xmm); break; case OP_MM: - write_mmx_reg(ctxt, &op->mm_val, op->addr.mm); + write_mmx_reg(&op->mm_val, op->addr.mm); break; case OP_NONE: /* no writeback */ @@ -5497,11 +5496,10 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op) +static void fetch_possible_mmx_operand(struct operand *op) { if (op->type == OP_MM) - read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); + read_mmx_reg(&op->mm_val, op->addr.mm); } static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) @@ -5580,10 +5578,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) * Now that we know the fpu is exception safe, we can fetch * operands from it. 
*/ - fetch_possible_mmx_operand(ctxt, &ctxt->src); - fetch_possible_mmx_operand(ctxt, &ctxt->src2); + fetch_possible_mmx_operand(&ctxt->src); + fetch_possible_mmx_operand(&ctxt->src2); if (!(ctxt->d & Mov)) - fetch_possible_mmx_operand(ctxt, &ctxt->dst); + fetch_possible_mmx_operand(&ctxt->dst); } if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { -- cgit v1.2.3 From d196842150e046a523f75785438137a9243c9627 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 13 Dec 2019 16:33:58 -0800 Subject: KVM: nVMX: WARN on failure to set IA32_PERF_GLOBAL_CTRL Writes to MSR_CORE_PERF_GLOBAL_CONTROL should never fail if the VM-exit and VM-entry controls are exposed to L1. Promote the checks to perform a full WARN if kvm_set_msr() fails and remove the now unused macro SET_MSR_OR_WARN(). Suggested-by: Sean Christopherson Cc: Paolo Bonzini Signed-off-by: Oliver Upton Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ef2d53854d15..2f2d49992d03 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -28,16 +28,6 @@ module_param(nested_early_check, bool, S_IRUGO); failed; \ }) -#define SET_MSR_OR_WARN(vcpu, idx, data) \ -({ \ - bool failed = kvm_set_msr(vcpu, idx, data); \ - if (failed) \ - pr_warn_ratelimited( \ - "%s cannot write MSR (0x%x, 0x%llx)\n", \ - __func__, idx, data); \ - failed; \ -}) - /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -2550,8 +2540,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl)) + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL; kvm_rsp_write(vcpu, vmcs12->guest_rsp); @@ -3996,8 +3986,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ -- cgit v1.2.3 From de761ea792c8344b7170624dbbcba4d730e23897 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Jan 2020 10:36:05 -0800 Subject: KVM: x86: Perform non-canonical checks in 32-bit KVM Remove the CONFIG_X86_64 condition from the low level non-canonical helpers to effectively enable non-canonical checks on 32-bit KVM. Non-canonical checks are performed by hardware if the CPU *supports* 64-bit mode, whether or not the CPU is actually in 64-bit mode is irrelevant. For the most part, skipping non-canonical checks on 32-bit KVM is ok-ish because 32-bit KVM always (hopefully) drops bits 63:32 of whatever value it's checking before propagating it to hardware, and architecturally, the expected behavior for the guest is a bit of a grey area since the vCPU itself doesn't support 64-bit mode. I.e. a 32-bit KVM guest can observe the missed checks in several paths, e.g. 
INVVPID and VM-Enter, but it's debatable whether or not the missed checks constitute a bug because technically the vCPU doesn't support 64-bit mode. The primary motivation for enabling the non-canonical checks is defense in depth. As mentioned above, a guest can trigger a missed check via INVVPID or VM-Enter. INVVPID is straightforward as it takes a 64-bit virtual address as part of its 128-bit INVVPID descriptor and fails if the address is non-canonical, even if INVVPID is executed in 32-bit PM. Nested VM-Enter is a bit more convoluted as it requires the guest to write natural width VMCS fields via memory accesses and then VMPTRLD the VMCS, but it's still possible. In both cases, KVM is saved from a true bug only because its flows that propagate values to hardware (correctly) take "unsigned long" parameters and so drop bits 63:32 of the bad value. Explicitly performing the non-canonical checks makes it less likely that a bad value will be propagated to hardware, e.g. in the INVVPID case, if __invvpid() didn't implicitly drop bits 63:32 then KVM would BUG() on the resulting unexpected INVVPID failure due to hardware rejecting the non-canonical address. The only downside to enabling the non-canonical checks is that it adds a relatively small amount of overhead, but the affected flows are not hot paths, i.e. the overhead is negligible. Note, KVM technically could gate the non-canonical checks on 32-bit KVM with static_cpu_has(X86_FEATURE_LM), but on bare metal that's an even bigger waste of code for everyone except the 0.00000000000001% of the population running on Yonah, and nested 32-bit on 64-bit already fudges things with respect to 64-bit CPU behavior. Signed-off-by: Sean Christopherson [Also do so in nested_vmx_check_host_state as reported by Krish. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 -- arch/x86/kvm/x86.h | 8 -------- 2 files changed, 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2f2d49992d03..53ea65070b5a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2813,7 +2813,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; -#ifdef CONFIG_X86_64 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || @@ -2821,7 +2820,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) return -EINVAL; -#endif /* * If the load IA32_EFER VM-exit control is 1, bits reserved in the diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index dd6e34d0a881..e007b61b932a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -161,21 +161,13 @@ static inline u64 get_canonical(u64 la, u8 vaddr_bits) static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) { -#ifdef CONFIG_X86_64 return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la; -#else - return false; -#endif } static inline bool emul_is_noncanonical_address(u64 la, struct x86_emulate_ctxt *ctxt) { -#ifdef CONFIG_X86_64 return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la; -#else - return false; -#endif } static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From de9bf4d4cea369355440c5afbbbc7e62302de8fd Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 15 Jan 2020 18:10:12 +0100 Subject: x86/kvm/hyper-v: remove stale evmcs_already_enabled check from nested_enable_evmcs() In nested_enable_evmcs() evmcs_already_enabled check doesn't really do anything: controls are already sanitized and we return '0' regardless. Just drop the check. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 72359709cdc1..89c3e0caf39f 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -350,17 +350,12 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { struct vcpu_vmx *vmx = to_vmx(vcpu); - bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled; vmx->nested.enlightened_vmcs_enabled = true; if (vmcs_version) *vmcs_version = nested_get_evmcs_version(vcpu); - /* We don't support disabling the feature for simplicity. */ - if (evmcs_already_enabled) - return 0; - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; -- cgit v1.2.3 From 2a5755bb21ee2f6567ce3eb2af10e8948df047d6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Jan 2020 09:57:14 -0500 Subject: KVM: X86: Don't take srcu lock in init_rmode_identity_map() We've already got the slots_lock, so we should be safe. 
Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 62fb639895c2..1d486e8eb4ef 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3483,7 +3483,7 @@ out: static int init_rmode_identity_map(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); - int i, idx, r = 0; + int i, r = 0; kvm_pfn_t identity_map_pfn; u32 tmp; @@ -3491,7 +3491,7 @@ static int init_rmode_identity_map(struct kvm *kvm) mutex_lock(&kvm->slots_lock); if (likely(kvm_vmx->ept_identity_pagetable_done)) - goto out2; + goto out; if (!kvm_vmx->ept_identity_map_addr) kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; @@ -3500,9 +3500,8 @@ static int init_rmode_identity_map(struct kvm *kvm) r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, kvm_vmx->ept_identity_map_addr, PAGE_SIZE); if (r < 0) - goto out2; + goto out; - idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -3518,9 +3517,6 @@ kvm_vmx->ept_identity_pagetable_done = true; out: - srcu_read_unlock(&kvm->srcu, idx); - -out2: mutex_unlock(&kvm->slots_lock); return r; } -- cgit v1.2.3 From 6a3c623ba8a842f895e80a7fa0feb94b7b4368f2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 9 Jan 2020 09:57:16 -0500 Subject: KVM: X86: Drop x86_set_memory_region() The helper x86_set_memory_region() is only used in vmx_set_tss_addr() and kvm_arch_destroy_vm(). Push the lock up into the callers in both cases. With that, drop x86_set_memory_region(). This prepares to allow __x86_set_memory_region() to return a mapped HVA, because the HVA will need to be protected by the lock too even after __x86_set_memory_region() returns.
Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx/vmx.c | 7 +++++-- arch/x86/kvm/x86.c | 22 +++++++--------------- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49751cbd6e63..69e31dbdfdc2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1627,7 +1627,6 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); -int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1d486e8eb4ef..5087bd7062f0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4491,8 +4491,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (enable_unrestricted_guest) return 0; - ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, - PAGE_SIZE * 3); + mutex_lock(&kvm->slots_lock); + ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, + PAGE_SIZE * 3); + mutex_unlock(&kvm->slots_lock); + if (ret) return ret; to_kvm_vmx(kvm)->tss_addr = addr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec8f05defd54..48cd4e191b9c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9732,18 +9732,6 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(__x86_set_memory_region); -int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) -{ - int r; - - mutex_lock(&kvm->slots_lock); - r = __x86_set_memory_region(kvm, id, gpa, size); - mutex_unlock(&kvm->slots_lock); - - return r; -} -EXPORT_SYMBOL_GPL(x86_set_memory_region); - void kvm_arch_pre_destroy_vm(struct kvm *kvm) { kvm_mmu_pre_destroy_vm(kvm); @@ -9757,9 +9745,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) * unless the the memory map has changed due to process exit * or fd copying. */ - x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_lock(&kvm->slots_lock); + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_unlock(&kvm->slots_lock); } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); -- cgit v1.2.3 From 7495e22bb165e7030bae4d9c6e84addb5ea17b29 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Jan 2020 09:57:19 -0500 Subject: KVM: Move running VCPU from ARM to common code For ring-based dirty log tracking, it will be more efficient to account writes during schedule-out or schedule-in to the currently running VCPU. We would like to do it even if the write doesn't use the current VCPU's address space, as is the case for cached writes (see commit 4e335d9e7ddb, "Revert "KVM: Support vCPU-based gfn->hva cache"", 2017-05-02). Therefore, add a mechanism to track the currently-loaded kvm_vcpu struct. There is already something similar in KVM/ARM; one important difference is that kvm_arch_vcpu_{load,put} have two callers in virt/kvm/kvm_main.c: we have to update both the architecture-independent vcpu_{load,put} and the preempt notifiers. 
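The tracking mechanism itself is small: a per-CPU pointer that the load/put paths and the preempt notifiers keep up to date, plus a trivial accessor. A simplified sketch of the common-code core added by this patch:

	static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

	/* Set from vcpu_load() and sched-in, cleared from vcpu_put() and
	 * sched-out, so it always names the vCPU loaded on this CPU. */
	struct kvm_vcpu *kvm_get_running_vcpu(void)
	{
		return __this_cpu_read(kvm_running_vcpu);
	}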
Another change made in the process is to allow using kvm_get_running_vcpu() in preemptible code. This is allowed because preempt notifiers ensure that the value does not change even after the VCPU thread is migrated. Signed-off-by: Paolo Bonzini Reviewed-by: Paolo Bonzini Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 -- arch/arm64/include/asm/kvm_host.h | 2 -- include/linux/kvm_host.h | 3 +++ virt/kvm/arm/arch_timer.c | 2 +- virt/kvm/arm/arm.c | 29 ----------------------------- virt/kvm/arm/perf.c | 6 +++--- virt/kvm/arm/vgic/vgic-mmio.c | 15 +++------------ virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++++- 8 files changed, 34 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index e26cad6d11b3..421594892304 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -284,8 +284,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -struct kvm_vcpu *kvm_arm_get_running_vcpu(void); -struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8ab62944e514..eb992eaa4165 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -446,8 +446,6 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -struct kvm_vcpu *kvm_arm_get_running_vcpu(void); -struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 83bd60f0af01..48e139c293c2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1335,6 +1335,9 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ +struct kvm_vcpu *kvm_get_running_vcpu(void); +struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); + #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS bool kvm_arch_has_irq_bypass(void); int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *, diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index f182b2380345..63dd6f27997c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -1022,7 +1022,7 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) bool kvm_arch_timer_get_input_level(int vintid) { - struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu(); + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); struct arch_timer_context *timer; if (vintid == vcpu_vtimer(vcpu)->irq.irq) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 1cfc108eca1e..3ff510599af6 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -51,9 +51,6 @@ __asm__(".arch_extension virt"); DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -/* Per-CPU variable containing the currently running vcpu. 
*/ -static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); - /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; @@ -62,31 +59,8 @@ static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); - -static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) -{ - __this_cpu_write(kvm_arm_running_vcpu, vcpu); -} - DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); -/** - * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU. - * Must be called from non-preemptible context - */ -struct kvm_vcpu *kvm_arm_get_running_vcpu(void) -{ - return __this_cpu_read(kvm_arm_running_vcpu); -} - -/** - * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus. - */ -struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) -{ - return &kvm_arm_running_vcpu; -} - int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -380,7 +354,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; vcpu->arch.host_cpu_context = &cpu_data->host_ctxt; - kvm_arm_set_running_vcpu(vcpu); kvm_vgic_load(vcpu); kvm_timer_vcpu_load(vcpu); kvm_vcpu_load_sysregs(vcpu); @@ -406,8 +379,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_vcpu_pmu_restore_host(vcpu); vcpu->cpu = -1; - - kvm_arm_set_running_vcpu(NULL); } static void vcpu_power_off(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/perf.c b/virt/kvm/arm/perf.c index 918cdc3839ea..d45b8b9a4415 100644 --- a/virt/kvm/arm/perf.c +++ b/virt/kvm/arm/perf.c @@ -13,14 +13,14 @@ static int kvm_is_in_guest(void) { - return kvm_arm_get_running_vcpu() != NULL; + return kvm_get_running_vcpu() != NULL; } static int kvm_is_user_mode(void) { struct kvm_vcpu *vcpu; - vcpu = kvm_arm_get_running_vcpu(); + vcpu = kvm_get_running_vcpu(); if (vcpu) return !vcpu_mode_priv(vcpu); @@ -32,7 +32,7 @@ static unsigned long kvm_get_guest_ip(void) { struct kvm_vcpu *vcpu; - vcpu = kvm_arm_get_running_vcpu(); + vcpu = kvm_get_running_vcpu(); if (vcpu) return *vcpu_pc(vcpu); diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 0d090482720d..d656ebd5f9d4 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -190,15 +190,6 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, * value later will give us the same value as we update the per-CPU variable * in the preempt notifier handlers. 
*/ -static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void) -{ - struct kvm_vcpu *vcpu; - - preempt_disable(); - vcpu = kvm_arm_get_running_vcpu(); - preempt_enable(); - return vcpu; -} /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, @@ -221,7 +212,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); + bool is_uaccess = !kvm_get_running_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -274,7 +265,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { - bool is_uaccess = !vgic_get_mmio_requester_vcpu(); + bool is_uaccess = !kvm_get_running_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; unsigned long flags; @@ -335,7 +326,7 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool active) { unsigned long flags; - struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu(); + struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu(); raw_spin_lock_irqsave(&irq->irq_lock, flags); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4f3ac8b753b6..7837fd524296 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -107,6 +107,7 @@ static atomic_t hardware_enable_failed; static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; +static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); struct dentry *kvm_debugfs_dir; EXPORT_SYMBOL_GPL(kvm_debugfs_dir); @@ -196,6 +197,8 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) void vcpu_load(struct kvm_vcpu *vcpu) { int cpu = get_cpu(); + + __this_cpu_write(kvm_running_vcpu, vcpu); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); @@ -207,6 +210,7 @@ void vcpu_put(struct kvm_vcpu *vcpu) preempt_disable(); kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); + __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } EXPORT_SYMBOL_GPL(vcpu_put); @@ -4288,8 +4292,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu) WRITE_ONCE(vcpu->preempted, false); WRITE_ONCE(vcpu->ready, false); + __this_cpu_write(kvm_running_vcpu, vcpu); kvm_arch_sched_in(vcpu, cpu); - kvm_arch_vcpu_load(vcpu, cpu); } @@ -4303,6 +4307,25 @@ static void kvm_sched_out(struct preempt_notifier *pn, WRITE_ONCE(vcpu->ready, true); } kvm_arch_vcpu_put(vcpu); + __this_cpu_write(kvm_running_vcpu, NULL); +} + +/** + * kvm_get_running_vcpu - get the vcpu running on the current CPU. + * Thanks to preempt notifiers, this can also be called from + * preemptible context. + */ +struct kvm_vcpu *kvm_get_running_vcpu(void) +{ + return __this_cpu_read(kvm_running_vcpu); +} + +/** + * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. 
+ */ +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) +{ + return &kvm_running_vcpu; } static void check_processor_compat(void *rtn) -- cgit v1.2.3 From b91991bf6b707482953c094dbd9615f6382ba2cb Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 15 Jan 2020 19:54:32 -0500 Subject: KVM: nVMX: Check GUEST_DR7 on vmentry of nested guests According to section "Checks on Guest Control Registers, Debug Registers, and MSRs" in Intel SDM vol 3C, the following checks are performed on vmentry of nested guests: If the "load debug controls" VM-entry control is 1, bits 63:32 in the DR7 field must be 0. In KVM, GUEST_DR7 is set prior to the vmcs02 VM-entry by kvm_set_dr() and the latter synthesizes a #GP if any bit in the high dword of the former is set. Hence this field needs to be checked in software. Signed-off-by: Krish Sadhukhan Reviewed-by: Karl Heubaum Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 6 ++++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 53ea65070b5a..95b3f4306ac2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2887,6 +2887,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && + CC(!kvm_dr7_valid(vmcs12->guest_dr7))) + return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) return -EINVAL; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 48cd4e191b9c..baf89d4bc653 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1082,7 +1082,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 5: /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) + if (!kvm_dr7_valid(val)) return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e007b61b932a..2d2ff855773b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -357,6 +357,12 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } +static inline bool kvm_dr7_valid(unsigned long data) +{ + /* Bits [63:32] are reserved */ + return !(data >> 32); +} + void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 17ac43a894ed3dd237d6def00c5ec2c7d975567e Mon Sep 17 00:00:00 2001 From: Haiwei Li Date: Thu, 16 Jan 2020 16:50:21 +0800 Subject: Adding 'else' to reduce checking. The two conditions are mutually exclusive, so add an 'else' to skip the second check when the first one has already matched.
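For context, an editorial sketch of why the two checks can never both be true: both predicates test the same timer_mode field, so a single LVTT mode is selected at any time (these reflect the existing lapic.c helpers in spirit; exact bodies may differ):

    static inline bool apic_lvtt_oneshot(struct kvm_lapic *apic)
    {
        return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
    }

    static inline bool apic_lvtt_tscdeadline(struct kvm_lapic *apic)
    {
        return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
    }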
Signed-off-by: Haiwei Li Reviewed-by: Vitaly Kuznetsov Reviewed-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 286396c0aa7d..cce1e6b204c8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1571,9 +1571,9 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic) struct kvm_timer *ktimer = &apic->lapic_timer; kvm_apic_local_deliver(apic, APIC_LVTT); - if (apic_lvtt_tscdeadline(apic)) + if (apic_lvtt_tscdeadline(apic)) { ktimer->tscdeadline = 0; - if (apic_lvtt_oneshot(apic)) { + } else if (apic_lvtt_oneshot(apic)) { ktimer->tscdeadline = 0; ktimer->target_expiration = 0; } -- cgit v1.2.3 From cef6db76f3165ac01ea49e023dea17002ee91618 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 21 Jan 2020 10:15:18 -0500 Subject: KVM: VMX: remove duplicated segment cache clear vmx_set_segment() clears the segment cache unconditionally, so we should not clear it again by calling vmx_segment_cache_clear(). Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5087bd7062f0..802ba97ac7f2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2696,8 +2696,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_segment_cache_clear(vmx); - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); -- cgit v1.2.3 From 4d6d07aee8343eac68ebde9389ba829c8c17dfc7 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Wed, 27 Nov 2019 08:30:25 +0800 Subject: kvm/x86: export kvm_vector_hashing_enabled() is unnecessary kvm_vector_hashing_enabled() is only called from within the kvm.ko module, so the export is unnecessary. Signed-off-by: Peng Hao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index baf89d4bc653..7e118883d8f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10381,7 +10381,6 @@ bool kvm_vector_hashing_enabled(void) { return vector_hashing; } -EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 22b1d57b032cea4d612746473ed28cb20665d876 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:35 -0800 Subject: KVM: x86/mmu: Enforce max_level on HugeTLB mappings Limit KVM's mapping level for HugeTLB based on its calculated max_level. The max_level check prior to invoking host_mapping_level() only filters out the case where KVM cannot create a 2mb mapping, it doesn't handle the scenario where KVM can create a 2mb but not 1gb mapping, and the host is using a 1gb HugeTLB mapping.
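A worked example of the fix (values assumed for illustration): suppose the host backs large_gfn with a 1gb HugeTLB page while KVM's ceiling is 2mb, e.g. because a 1gb mapping would overlap a disallowed NX huge page. Clamping with min() keeps the 2mb ceiling:

    host_level = host_mapping_level(vcpu->kvm, large_gfn); /* PT_PDPE_LEVEL, 1gb */
    /* max_level == PT_DIRECTORY_LEVEL, i.e. 2mb */
    return min(host_level, max_level); /* 2mb, not the host's 1gb */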
Fixes: 2f57b7051fe8 ("KVM: x86/mmu: Persist gfn_lpage_is_disallowed() to max_level") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b9052c7ba43d..db597f57cdc2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1330,7 +1330,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int *max_levelp) { - int max_level = *max_levelp; + int host_level, max_level = *max_levelp; struct kvm_memory_slot *slot; if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) @@ -1362,7 +1362,8 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, * So, do not propagate host_mapping_level() to max_level as KVM can * still promote the guest mapping to a huge page in the THP case. */ - return host_mapping_level(vcpu->kvm, large_gfn); + host_level = host_mapping_level(vcpu->kvm, large_gfn); + return min(host_level, max_level); } /* -- cgit v1.2.3 From 005ba37cb89bcc0cf63c2029a41f8db165aeb615 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:36 -0800 Subject: mm: thp: KVM: Explicitly check for THP when populating secondary MMU Add a helper, is_transparent_hugepage(), to explicitly check whether a compound page is a THP and use it when populating KVM's secondary MMU. The explicit check fixes a bug where a remapped compound page, e.g. for an XDP Rx socket, is mapped into a KVM guest and is mistaken for a THP, which results in KVM incorrectly creating a huge page in its secondary MMU. Fixes: 936a5fe6e6148 ("thp: kvm mmu transparent hugepage support") Reported-by: syzbot+c9d1fb51ac9d0d10c39d@syzkaller.appspotmail.com Cc: Andrea Arcangeli Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- include/linux/huge_mm.h | 6 ++++++ include/linux/kvm_host.h | 1 + mm/huge_memory.c | 11 +++++++++++ virt/kvm/arm/mmu.c | 8 +------- virt/kvm/kvm_main.c | 10 ++++++++++ 6 files changed, 31 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index db597f57cdc2..7eb21a22cc13 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3344,7 +3344,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompoundMap(pfn_to_page(pfn))) { + kvm_is_transparent_hugepage(pfn)) { unsigned long mask; /* @@ -5961,7 +5961,7 @@ restart: */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + kvm_is_transparent_hugepage(pfn)) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0b84e13e88e2..5aca3d1bdb32 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -160,6 +160,7 @@ extern unsigned long thp_get_unmapped_area(struct file *filp, extern void prep_transhuge_page(struct page *page); extern void free_transhuge_page(struct page *page); +bool is_transparent_hugepage(struct page *page); bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); @@ -308,6 +309,11 @@ static inline bool 
transhuge_vma_suitable(struct vm_area_struct *vma, static inline void prep_transhuge_page(struct page *page) {} +static inline bool is_transparent_hugepage(struct page *page) +{ + return false; +} + #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 48e139c293c2..46fdb7533678 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -976,6 +976,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); +bool kvm_is_transparent_hugepage(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 41a0fbddc96b..9b3ee79d0edf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -527,6 +527,17 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } +bool is_transparent_hugepage(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return is_huge_zero_page(page) || + page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; +} +EXPORT_SYMBOL_GPL(is_transparent_hugepage); + static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, loff_t off, unsigned long flags, unsigned long size) { diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 0b32a904a1bb..dc8254bf30ea 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -1377,14 +1377,8 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) { kvm_pfn_t pfn = *pfnp; gfn_t gfn = *ipap >> PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - /* - * PageTransCompoundMap() returns true for THP and - * hugetlbfs. Make sure the adjustment is done only for THP - * pages. - */ - if (!PageHuge(page) && PageTransCompoundMap(page)) { + if (kvm_is_transparent_hugepage(pfn)) { unsigned long mask; /* * The address we faulted on is backed by a transparent huge diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ffec9f427b55..64e9e9d65ed4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -191,6 +191,16 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) return true; } +bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (!PageTransCompoundMap(page)) + return false; + + return is_transparent_hugepage(compound_head(page)); +} + /* * Switches to specified vcpu, until a matching vcpu_put() */ -- cgit v1.2.3 From f9b84e19221efc5f493156ee0329df3142085f28 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:37 -0800 Subject: KVM: Use vcpu-specific gva->hva translation when querying host page size Use kvm_vcpu_gfn_to_hva() when retrieving the host page size so that the correct set of memslots is used when handling x86 page faults in SMM. 
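Roughly, as an editorial note: the kvm-based accessor always resolves the gfn in memslot address space 0, while the vcpu-based accessor honors the vCPU's current address space (regular vs. SMM):

    addr = gfn_to_hva(kvm, gfn);           /* always memslot address space 0 */
    addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); /* follows the vCPU's SMM state */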
Fixes: 54bf36aac520 ("KVM: x86: use vcpu-specific functions to read/write/translate GFNs") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_xive_native.c | 2 +- arch/x86/kvm/mmu/mmu.c | 6 +++--- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index d83adb1e1490..6ef0151ff70a 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -631,7 +631,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, srcu_idx = srcu_read_lock(&kvm->srcu); gfn = gpa_to_gfn(kvm_eq.qaddr); - page_size = kvm_host_page_size(kvm, gfn); + page_size = kvm_host_page_size(vcpu, gfn); if (1ull << kvm_eq.qshift > page_size) { srcu_read_unlock(&kvm->srcu, srcu_idx); pr_warn("Incompatible host page size %lx!\n", page_size); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7eb21a22cc13..e4458c9aec8c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1286,12 +1286,12 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } -static int host_mapping_level(struct kvm *kvm, gfn_t gfn) +static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn) { unsigned long page_size; int i, ret = 0; - page_size = kvm_host_page_size(kvm, gfn); + page_size = kvm_host_page_size(vcpu, gfn); for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) @@ -1362,7 +1362,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, * So, do not propagate host_mapping_level() to max_level as KVM can * still promote the guest mapping to a huge page in the THP case. 
*/ - host_level = host_mapping_level(vcpu->kvm, large_gfn); + host_level = host_mapping_level(vcpu, large_gfn); return min(host_level, max_level); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 46fdb7533678..6d5331b0d937 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -762,7 +762,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); -unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn); +unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64e9e9d65ed4..f6f8ffc2e865 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1402,14 +1402,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); -unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) +unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr, size; size = PAGE_SIZE; - addr = gfn_to_hva(kvm, gfn); + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return PAGE_SIZE; -- cgit v1.2.3 From 13c72c060f1ba6f4eddd7b1c4f52a8aded43d6d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:39 -0800 Subject: x86/mm: Introduce lookup_address_in_mm() Add a helper, lookup_address_in_mm(), to traverse the page tables of a given mm struct. KVM will use the helper to retrieve the host mapping level, e.g. 4k vs. 2mb vs. 1gb, of a compound (or DAX-backed) page without having to resort to implementation specific metadata. E.g. KVM currently uses different logic for HugeTLB vs. THP, and would add a third variant for DAX-backed files. Cc: Dan Williams Signed-off-by: Sean Christopherson Reviewed-by: Thomas Gleixner Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pgtable_types.h | 4 ++++ arch/x86/mm/pageattr.c | 11 +++++++++++ 2 files changed, 15 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b5e49e6bac63..b68f72adb53e 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -561,6 +561,10 @@ static inline void update_page_count(int level, unsigned long pages) { } extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); + +struct mm_struct; +extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address, + unsigned int *level); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b99ad05b117..2c70a8b20b04 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -618,6 +618,17 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) } EXPORT_SYMBOL_GPL(lookup_address); +/* + * Lookup the page table entry for a virtual address in a given mm. Return a + * pointer to the entry and the level of the mapping. 
+ */ +pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address, + unsigned int *level) +{ + return lookup_address_in_pgd(pgd_offset(mm, address), address, level); +} +EXPORT_SYMBOL_GPL(lookup_address_in_mm); + static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level) { -- cgit v1.2.3 From 17eff01904f5f2fa12f4a56666637ce69ce5c645 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:40 -0800 Subject: KVM: x86/mmu: Refactor THP adjust to prep for changing query Refactor transparent_hugepage_adjust() in preparation for walking the host page tables to identify hugepage mappings, initially for THP pages, and eventually for HugeTLB and DAX-backed pages as well. The latter cases support 1gb pages, i.e. the adjustment logic needs access to the max allowed level. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++--------------------- arch/x86/kvm/mmu/paging_tmpl.h | 3 +-- 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e4458c9aec8c..64c28a39d8ef 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3329,33 +3329,34 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t gfn, kvm_pfn_t *pfnp, +static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; int level = *levelp; + kvm_pfn_t mask; + + if (max_level == PT_PAGE_TABLE_LEVEL || level > PT_PAGE_TABLE_LEVEL) + return; + + if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || + kvm_is_zone_device_pfn(pfn)) + return; + + if (!kvm_is_transparent_hugepage(pfn)) + return; + + level = PT_DIRECTORY_LEVEL; /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. + * mmu_notifier_retry() was successful and mmu_lock is held, so + * the pmd can't be split from under us.
- */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - *pfnp = pfn & ~mask; - } + *levelp = level; + mask = KVM_PAGES_PER_HPAGE(level) - 1; + VM_BUG_ON((gfn & mask) != (pfn & mask)); + *pfnp = pfn & ~mask; } static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, @@ -3395,8 +3396,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - if (likely(max_level > PT_PAGE_TABLE_LEVEL)) - transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); + transparent_hugepage_adjust(vcpu, gfn, max_level, &pfn, &level); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index eaa00c4daeb1..1ad87f0e19d0 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -688,8 +688,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); base_gfn = gfn; - if (max_level > PT_PAGE_TABLE_LEVEL) - transparent_hugepage_adjust(vcpu, gw->gfn, &pfn, &hlevel); + transparent_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, &hlevel); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); -- cgit v1.2.3 From db5432165e9b51d2a36572be38d078e79f8df0d8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:41 -0800 Subject: KVM: x86/mmu: Walk host page tables to find THP mappings Explicitly walk the host page tables to identify THP mappings instead of relying solely on the metadata in struct page. This sets the stage for using a common method of identifying huge mappings regardless of the underlying implementation (HugeTLB vs THP vs DAX), and hopefully avoids the pitfalls of relying on metadata to identify THP mappings, e.g. see commit 169226f7e0d2 ("mm: thp: handle page cache THP correctly in PageTransCompoundMap") and the need for KVM to explicitly check for a THP compound page. KVM will also naturally work with 1gb THP pages, if they are ever supported. Walking the tables for THP mappings is likely marginally slower than querying metadata, but a future patch will reuse the walk to identify HugeTLB mappings, at which point eliminating the existing VMA lookup for HugeTLB will make this a net positive.
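A minimal usage sketch of the walk (assuming the gfn has a valid, visible memslot and mmu_lock is held), mirroring the helper added below:

    hva = __gfn_to_hva_memslot(slot, gfn);
    pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
    if (pte && level > PG_LEVEL_4K)
        /* the host maps this hva with a huge page (THP for now) */
        max_level = min(max_level, level);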
Cc: Andrea Arcangeli Cc: Barret Rhoden Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64c28a39d8ef..3acaadb7acb8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3329,6 +3329,34 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, + kvm_pfn_t pfn) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + pte_t *pte; + int level; + + BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K || + PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || + PT_PDPE_LEVEL != (int)PG_LEVEL_1G); + + if (!PageCompound(pfn_to_page(pfn))) + return PT_PAGE_TABLE_LEVEL; + + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); + if (!slot) + return PT_PAGE_TABLE_LEVEL; + + hva = __gfn_to_hva_memslot(slot, gfn); + + pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); + if (unlikely(!pte)) + return PT_PAGE_TABLE_LEVEL; + + return level; +} + static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp, int *levelp) @@ -3344,10 +3372,11 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_is_zone_device_pfn(pfn)) return; - if (!kvm_is_transparent_hugepage(pfn)) + level = host_pfn_mapping_level(vcpu, gfn, pfn); + if (level == PT_PAGE_TABLE_LEVEL) return; - level = PT_DIRECTORY_LEVEL; + level = min(level, max_level); /* * mmu_notifier_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. -- cgit v1.2.3 From f9fa2509e5ca8229b4baca295865b542803bf25d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:42 -0800 Subject: KVM: x86/mmu: Drop level optimization from fast_page_fault() Remove fast_page_fault()'s optimization to stop the shadow walk if the iterator level drops below the intended map level. The intended map level is only accurate for HugeTLB mappings (THP mappings are detected after fast_page_fault()), i.e. it's not required for correctness, and a future patch will also move HugeTLB mapping detection to after fast_page_fault(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3acaadb7acb8..17645c2d23e1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3586,7 +3586,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) * - true: let the vcpu to access on the same address again. * - false: let the real page fault path to fix it.
*/ -static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, +static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code) { struct kvm_shadow_walk_iterator iterator; @@ -3604,8 +3604,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level, u64 new_spte; for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte) || - iterator.level < level) + if (!is_shadow_present_pte(spte)) break; sp = page_header(__pa(iterator.sptep)); @@ -4218,7 +4217,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (level > PT_PAGE_TABLE_LEVEL) gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - if (fast_page_fault(vcpu, gpa, level, error_code)) + if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; mmu_seq = vcpu->kvm->mmu_notifier_seq; -- cgit v1.2.3 From 83f06fa7a6fd9d5758e5f8438e2137f25f6f2e6b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:43 -0800 Subject: KVM: x86/mmu: Rely on host page tables to find HugeTLB mappings Remove KVM's HugeTLB specific logic and instead rely on walking the host page tables (already done for THP) to identify HugeTLB mappings. Eliminating the HugeTLB-only logic avoids taking mmap_sem and calling find_vma() for all hugepage compatible page faults, and simplifies KVM's page fault code by consolidating all hugepage adjustments into a common helper. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 84 ++++++++++++------------------------------ arch/x86/kvm/mmu/paging_tmpl.h | 15 +++----- 2 files changed, 29 insertions(+), 70 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17645c2d23e1..6be0239dcfbf 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1286,23 +1286,6 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } -static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - unsigned long page_size; - int i, ret = 0; - - page_size = kvm_host_page_size(vcpu, gfn); - - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - if (page_size >= KVM_HPAGE_SIZE(i)) - ret = i; - else - break; - } - - return ret; -} - static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, bool no_dirty_log) { @@ -1327,43 +1310,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, return slot; } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, - int *max_levelp) +static int max_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level) { - int host_level, max_level = *max_levelp; struct kvm_memory_slot *slot; if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); - if (!memslot_valid_for_gpte(slot, true)) { - *max_levelp = PT_PAGE_TABLE_LEVEL; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!memslot_valid_for_gpte(slot, true)) return PT_PAGE_TABLE_LEVEL; - } max_level = min(max_level, kvm_x86_ops->get_lpage_level()); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { - if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot)) + if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) break; } - *max_levelp = max_level; - - if (max_level == PT_PAGE_TABLE_LEVEL) - return PT_PAGE_TABLE_LEVEL; - - /* - * Note, host_mapping_level() does *not* handle transparent huge pages. 
- * As suggested by "mapping", it reflects the page size established by - * the associated vma, if there is one, i.e. host_mapping_level() will - * return a huge page level if and only if a vma exists and the backing - * implementation for the vma uses huge pages, e.g. hugetlbfs and dax. - * So, do not propagate host_mapping_level() to max_level as KVM can - * still promote the guest mapping to a huge page in the THP case. - */ - host_level = host_mapping_level(vcpu, large_gfn); - return min(host_level, max_level); + return max_level; } /* @@ -3137,7 +3102,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, /* * Other vcpu creates new sp in the window between - * mapping_level() and acquiring mmu-lock. We can + * max_mapping_level() and acquiring mmu-lock. We can * allow guest to retry the access, the mapping can * be fixed if guest refault. */ @@ -3357,24 +3322,23 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, return level; } -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - int *levelp) +static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, + int max_level, kvm_pfn_t *pfnp) { kvm_pfn_t pfn = *pfnp; - int level = *levelp; kvm_pfn_t mask; + int level; - if (max_level == PT_PAGE_TABLE_LEVEL || level > PT_PAGE_TABLE_LEVEL) - return; + if (max_level == PT_PAGE_TABLE_LEVEL) + return PT_PAGE_TABLE_LEVEL; if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || kvm_is_zone_device_pfn(pfn)) - return; + return PT_PAGE_TABLE_LEVEL; level = host_pfn_mapping_level(vcpu, gfn, pfn); if (level == PT_PAGE_TABLE_LEVEL) - return; + return level; level = min(level, max_level); @@ -3382,10 +3346,11 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, * mmu_notifier_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. 
*/ - *levelp = level; mask = KVM_PAGES_PER_HPAGE(level) - 1; VM_BUG_ON((gfn & mask) != (pfn & mask)); *pfnp = pfn & ~mask; + + return level; } static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, @@ -3412,20 +3377,19 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, - int map_writable, int level, int max_level, - kvm_pfn_t pfn, bool prefault, - bool account_disallowed_nx_lpage) + int map_writable, int max_level, kvm_pfn_t pfn, + bool prefault, bool account_disallowed_nx_lpage) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int ret; + int level, ret; gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - transparent_hugepage_adjust(vcpu, gfn, max_level, &pfn, &level); + level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { @@ -4201,7 +4165,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; kvm_pfn_t pfn; - int level, r; + int r; if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4213,9 +4177,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (lpage_disallowed) max_level = PT_PAGE_TABLE_LEVEL; - level = mapping_level(vcpu, gfn, &max_level); - if (level > PT_PAGE_TABLE_LEVEL) - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + max_level = max_mapping_level(vcpu, gfn, max_level); if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; @@ -4235,7 +4197,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, goto out_unlock; if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - r = __direct_map(vcpu, gpa, write, map_writable, level, max_level, pfn, + r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, prefault, is_tdp && lpage_disallowed); out_unlock: diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1ad87f0e19d0..472c32cdf2ff 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -628,14 +628,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct guest_walker *gw, - int write_fault, int hlevel, int max_level, + int write_fault, int max_level, kvm_pfn_t pfn, bool map_writable, bool prefault, bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; - int top_level, ret; + int top_level, hlevel, ret; gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -688,7 +688,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); base_gfn = gfn; - transparent_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, &hlevel); + hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -790,7 +790,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, struct guest_walker walker; int r; kvm_pfn_t pfn; - int level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && @@ -840,9 +839,7 @@ static int FNAME(page_fault)(struct kvm_vcpu 
*vcpu, gpa_t addr, u32 error_code, else max_level = walker.level; - level = mapping_level(vcpu, walker.gfn, &max_level); - if (level > PT_PAGE_TABLE_LEVEL) - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + max_level = max_mapping_level(vcpu, walker.gfn, max_level); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -882,8 +879,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); if (make_mmu_pages_available(vcpu) < 0) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, write_fault, level, max_level, - pfn, map_writable, prefault, lpage_disallowed); + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, + map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: -- cgit v1.2.3 From 09c4453ee8e63a710774ae10da7909289aa6a58e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:44 -0800 Subject: KVM: x86/mmu: Remove obsolete gfn restoration in FNAME(fetch) Remove logic to retrieve the original gfn now that HugeTLB mappings are identified in FNAME(fetch), i.e. FNAME(page_fault) no longer adjusts the level or gfn. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 472c32cdf2ff..885da924827c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -636,7 +636,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, hlevel, ret; - gfn_t gfn, base_gfn; + gfn_t base_gfn = gw->gfn; direct_access = gw->pte_access; @@ -681,13 +681,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - /* - * FNAME(page_fault) might have clobbered the bottom bits of - * gw->gfn, restore them from the virtual address. - */ - gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); - base_gfn = gfn; - hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn); trace_kvm_mmu_spte_requested(addr, gw->level, pfn); @@ -699,9 +692,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel); - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; -- cgit v1.2.3 From d32ec81bab670e599e645e1d1d5231d62de7d0d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:45 -0800 Subject: KVM: x86/mmu: Zap any compound page when collapsing sptes Zap any compound page, e.g. THP or HugeTLB pages, when zapping sptes that can potentially be converted to huge sptes after disabling dirty logging on the associated memslot. Note, this approach could result in false positives, e.g. if a random compound page is mapped into the guest, but mapping non-huge compound pages into the guest is far from the norm, and toggling dirty logging is not a frequent operation.
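To make the tradeoff concrete, a sketch of the relaxed check (this mirrors the hunk below; PageCompound() also matches HugeTLB and other compound allocations, hence the possible false positives):

    if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
        !kvm_is_zone_device_pfn(pfn) &&
        PageCompound(pfn_to_page(pfn))) /* THP, HugeTLB, or other compound */
        pte_list_remove(rmap_head, sptep);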
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6be0239dcfbf..9090842ccd10 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5951,7 +5951,7 @@ restart: */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn) && - kvm_is_transparent_hugepage(pfn)) { + PageCompound(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) -- cgit v1.2.3 From 293e306e7faac4eafaefb9518a1cd6eaecad88e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:46 -0800 Subject: KVM: x86/mmu: Fold max_mapping_level() into kvm_mmu_hugepage_adjust() Fold max_mapping_level() into kvm_mmu_hugepage_adjust() now that HugeTLB mappings are handled in kvm_mmu_hugepage_adjust(), i.e. there isn't a need to pre-calculate the max mapping level. Co-locating all hugepage checks eliminates a memslot lookup, at the cost of performing the __mmu_gfn_lpage_is_disallowed() checks while holding mmu_lock. The latency of lpage_is_disallowed() is likely negligible relative to the rest of the code run while holding mmu_lock, and can be offset to some extent by eliminating the mmu_gfn_lpage_is_disallowed() check in set_spte() in a future patch. Eliminating the check in set_spte() is made possible by performing the initial lpage_is_disallowed() checks while holding mmu_lock. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 65 +++++++++++++++++++----------------------- arch/x86/kvm/mmu/paging_tmpl.h | 2 -- 2 files changed, 30 insertions(+), 37 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9090842ccd10..812c69f7f552 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1310,27 +1310,6 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, return slot; } -static int max_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level) -{ - struct kvm_memory_slot *slot; - - if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) - return PT_PAGE_TABLE_LEVEL; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!memslot_valid_for_gpte(slot, true)) - return PT_PAGE_TABLE_LEVEL; - - max_level = min(max_level, kvm_x86_ops->get_lpage_level()); - for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { - if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) - break; - } - - return max_level; -} - /* * About rmap_head encoding: * @@ -3101,10 +3080,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_WRITE_MASK) { /* - * Other vcpu creates new sp in the window between - * max_mapping_level() and acquiring mmu-lock. We can - * allow guest to retry the access, the mapping can - * be fixed if guest refault. + * Legacy code to handle an obsolete scenario where a different + * vcpu creates new sp in the window between this vcpu's query + * of lpage_is_disallowed() and acquiring mmu_lock. No longer + * necessary now that lpage_is_disallowed() is called after + * acquiring mmu_lock. 
*/ if (level > PT_PAGE_TABLE_LEVEL && mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) @@ -3295,9 +3275,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, - kvm_pfn_t pfn) + kvm_pfn_t pfn, struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; unsigned long hva; pte_t *pte; int level; @@ -3309,10 +3288,14 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, if (!PageCompound(pfn_to_page(pfn))) return PT_PAGE_TABLE_LEVEL; - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); - if (!slot) - return PT_PAGE_TABLE_LEVEL; - + /* + * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() + * is not solely for performance, it's also necessary to avoid the + * "writable" check in __gfn_to_hva_many(), which will always fail on + * read-only memslots due to gfn_to_hva() assuming writes. Earlier + * page fault steps have already verified the guest isn't writing a + * read-only memslot. + */ hva = __gfn_to_hva_memslot(slot, gfn); pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); @@ -3325,18 +3308,32 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp) { + struct kvm_memory_slot *slot; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; - if (max_level == PT_PAGE_TABLE_LEVEL) + if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || kvm_is_zone_device_pfn(pfn)) return PT_PAGE_TABLE_LEVEL; - level = host_pfn_mapping_level(vcpu, gfn, pfn); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); + if (!slot) + return PT_PAGE_TABLE_LEVEL; + + max_level = min(max_level, kvm_x86_ops->get_lpage_level()); + for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) + break; + } + + if (max_level == PT_PAGE_TABLE_LEVEL) + return PT_PAGE_TABLE_LEVEL; + + level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); if (level == PT_PAGE_TABLE_LEVEL) return level; @@ -4177,8 +4174,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (lpage_disallowed) max_level = PT_PAGE_TABLE_LEVEL; - max_level = max_mapping_level(vcpu, gfn, max_level); - if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 885da924827c..4e1ef0473663 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -832,8 +832,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, else max_level = walker.level; - max_level = max_mapping_level(vcpu, walker.gfn, max_level); - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); -- cgit v1.2.3 From 2c0629f4b95cf5adf5b6f78f7d318df894b5f9a1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:47 -0800 Subject: KVM: x86/mmu: Remove lpage_is_disallowed() check from set_spte() Remove the late "lpage is disallowed" check from set_spte() now that the initial check is performed after acquiring mmu_lock. Fold the guts of the remaining helper, __mmu_gfn_lpage_is_disallowed(), into kvm_mmu_hugepage_adjust() to eliminate the unnecessary slot !NULL check. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 39 +++------------------------------------ 1 file changed, 3 insertions(+), 36 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 812c69f7f552..a9e6683c802b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1264,28 +1264,6 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->lpage_disallowed_link); } -static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, - struct kvm_memory_slot *slot) -{ - struct kvm_lpage_info *linfo; - - if (slot) { - linfo = lpage_info_slot(gfn, slot, level); - return !!linfo->disallow_lpage; - } - - return true; -} - -static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, - int level) -{ - struct kvm_memory_slot *slot; - - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); -} - static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, bool no_dirty_log) { @@ -3078,18 +3056,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - - /* - * Legacy code to handle an obsolete scenario where a different - * vcpu creates new sp in the window between this vcpu's query - * of lpage_is_disallowed() and acquiring mmu_lock. No longer - * necessary now that lpage_is_disallowed() is called after - * acquiring mmu_lock. - */ - if (level > PT_PAGE_TABLE_LEVEL && - mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) - goto done; - spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; /* @@ -3121,7 +3087,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; -done: return ret; } @@ -3309,6 +3274,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, int max_level, kvm_pfn_t *pfnp) { struct kvm_memory_slot *slot; + struct kvm_lpage_info *linfo; kvm_pfn_t pfn = *pfnp; kvm_pfn_t mask; int level; @@ -3326,7 +3292,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, max_level = min(max_level, kvm_x86_ops->get_lpage_level()); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { - if (!__mmu_gfn_lpage_is_disallowed(gfn, max_level, slot)) + linfo = lpage_info_slot(gfn, slot, max_level); + if (!linfo->disallow_lpage) break; } -- cgit v1.2.3 From e851265a816f96a86c5a0316d2fc4d45be76d1d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Jan 2020 12:24:48 -0800 Subject: KVM: x86/mmu: Use huge pages for DAX-backed files Walk the host page tables to identify hugepage mappings for ZONE_DEVICE pfns, i.e. DAX pages. Explicitly query kvm_is_zone_device_pfn() when deciding whether or not to bother walking the host page tables, as DAX pages do not set up the head/tail infrastructure, i.e. will return false for PageCompound() even when using huge pages. Zap ZONE_DEVICE sptes when disabling dirty logging, e.g. if live migration fails, to allow KVM to rebuild large pages for DAX-based mappings. Presumably DAX favors large pages, and worst case scenario is a minor performance hit as KVM will need to re-fault all DAX-based pages. 
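Sketch of the key predicate change (taken from the hunk below): DAX/ZONE_DEVICE pages do not set up compound-page metadata, so PageCompound() alone would wrongly force 4k mappings for them:

    /* Walk the host page tables unless the pfn is neither compound nor DAX. */
    if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
        return PT_PAGE_TABLE_LEVEL;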
Suggested-by: Barret Rhoden Cc: David Hildenbrand Cc: Dan Williams Cc: Jason Zeng Cc: Dave Jiang Cc: Liran Alon Cc: linux-nvdimm Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a9e6683c802b..febd65a9721a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3250,7 +3250,7 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || PT_PDPE_LEVEL != (int)PG_LEVEL_1G); - if (!PageCompound(pfn_to_page(pfn))) + if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) return PT_PAGE_TABLE_LEVEL; /* @@ -3282,8 +3282,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) return PT_PAGE_TABLE_LEVEL; - if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn) || - kvm_is_zone_device_pfn(pfn)) + if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) return PT_PAGE_TABLE_LEVEL; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); @@ -5912,8 +5911,8 @@ restart: * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - !kvm_is_zone_device_pfn(pfn) && - PageCompound(pfn_to_page(pfn))) { + (kvm_is_zone_device_pfn(pfn) || + PageCompound(pfn_to_page(pfn)))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) -- cgit v1.2.3 From 91b0d268a59dd9c18221ea750b80f9a317b29ed2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Jan 2020 16:16:32 +0100 Subject: KVM: x86: inline memslot_valid_for_gpte The function now has a single caller, so there is no point in keeping it separate. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index febd65a9721a..84eeb61d06aa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1264,17 +1264,6 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->lpage_disallowed_link); } -static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, - bool no_dirty_log) -{ - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return false; - if (no_dirty_log && slot->dirty_bitmap) - return false; - - return true; -} - static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -1282,8 +1271,10 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!memslot_valid_for_gpte(slot, no_dirty_log)) - slot = NULL; + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return NULL; + if (no_dirty_log && slot->dirty_bitmap) + return NULL; return slot; } -- cgit v1.2.3 From 52db369823b28616377b8ceb6b6b3879735b9e75 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 22 Jan 2020 11:21:44 +0800 Subject: KVM: X86: Add 'else' to unify fastop and execute call path It also helps eliminate some duplicated code. 
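The resulting control flow (as produced by the hunk below) funnels both dispatch paths through one return-code check:

    if (ctxt->d & Fastop) {
        void (*fop)(struct fastop *) = (void *)ctxt->execute;
        rc = fastop(ctxt, fop);
    } else {
        rc = ctxt->execute(ctxt);
    }
    if (rc != X86EMUL_CONTINUE)
        goto done;
    goto writeback;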
Signed-off-by: Miaohe Lin Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c7a0da45f60a..0accce94f660 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5683,11 +5683,9 @@ special_insn: if (ctxt->d & Fastop) { void (*fop)(struct fastop *) = (void *)ctxt->execute; rc = fastop(ctxt, fop); - if (rc != X86EMUL_CONTINUE) - goto done; - goto writeback; + } else { + rc = ctxt->execute(ctxt); } - rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; goto writeback; -- cgit v1.2.3 From 3009afc6e39e78708d8fb444ae50544b3bcd3a3f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 21 Jan 2020 20:43:39 -0800 Subject: KVM: x86: Use a typedef for fastop functions Add a typedef for the fastop function prototype to make the code more readable. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0accce94f660..ddbc61984227 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -311,7 +311,9 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) #define ON64(x) #endif -static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); +typedef void (*fastop_t)(struct fastop *); + +static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); #define __FOP_FUNC(name) \ ".align " __stringify(FASTOP_SIZE) " \n\t" \ @@ -5502,7 +5504,7 @@ static void fetch_possible_mmx_operand(struct operand *op) read_mmx_reg(&op->mm_val, op->addr.mm); } -static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) +static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) { ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; @@ -5680,12 +5682,10 @@ special_insn: ctxt->eflags &= ~X86_EFLAGS_RF; if (ctxt->execute) { - if (ctxt->d & Fastop) { - void (*fop)(struct fastop *) = (void *)ctxt->execute; - rc = fastop(ctxt, fop); - } else { + if (ctxt->d & Fastop) + rc = fastop(ctxt, (fastop_t)ctxt->execute); + else rc = ctxt->execute(ctxt); - } if (rc != X86EMUL_CONTINUE) goto done; goto writeback; -- cgit v1.2.3 From e032e3b55b6f487e48c163c5dca74086f147a169 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Wed, 22 Jan 2020 10:25:42 +0530 Subject: KVM: PPC: Book3S HV: Release lock on page-out failure path When migrate_vma_setup() fails in kvmppc_svm_page_out(), release kvm->arch.uvmem_lock before returning.
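The shape of this fix, as a hedged generic sketch (locals and the actual page-out work are elided; this is not the kernel function itself): every failure path reached after uvmem_lock is taken must funnel through the unlock instead of returning directly.

static int page_out_sketch(struct kvm *kvm, struct migrate_vma *mig)
{
	int ret;

	mutex_lock(&kvm->arch.uvmem_lock);

	ret = migrate_vma_setup(mig);
	if (ret)
		goto out;	/* a bare "return ret;" here leaks the lock */

	/* ... the actual page-out work would happen here ... */

out:
	mutex_unlock(&kvm->arch.uvmem_lock);
	return ret;
}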
Fixes: ca9f4942670 ("KVM: PPC: Book3S HV: Support for running secure guests") Signed-off-by: Bharata B Rao Reviewed-by: Kamalesh Babulal Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 4d1f25a3959a..79b1202b1c62 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -571,7 +571,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, ret = migrate_vma_setup(&mig); if (ret) - return ret; + goto out; spage = migrate_pfn_to_page(*mig.src); if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE)) -- cgit v1.2.3 From fd24a8624eb29d3b6b7df68096ce0321b19b03c6 Mon Sep 17 00:00:00 2001 From: David Michael Date: Sun, 26 Jan 2020 17:31:58 -0500 Subject: KVM: PPC: Book3S PR: Fix -Werror=return-type build failure Fixes: 3a167beac07c ("kvm: powerpc: Add kvmppc_ops callback") Signed-off-by: David Michael Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index ce4fcf76e53e..eb86a2f26986 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -2030,6 +2030,7 @@ static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, { /* We should not get called */ BUG(); + return 0; } #endif /* CONFIG_PPC64 */ -- cgit v1.2.3 From 8c6de56a42e0c657955e12b882a81ef07d1d073e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 30 Oct 2019 19:01:31 +0000 Subject: x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit kvm_steal_time_set_preempted() may accidentally clear the KVM_VCPU_FLUSH_TLB bit if it is called more than once while the VCPU is preempted. This is part of CVE-2019-3016. (This bug was also independently discovered by Jim Mattson) Signed-off-by: Boris Ostrovsky Reviewed-by: Joao Martins Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf917139de6b..8c9369151e9f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3504,6 +3504,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; + if (vcpu->arch.st.steal.preempted) + return; + vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, -- cgit v1.2.3 From 917248144db5d7320655dbb41d3af0b8a0f3d589 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 5 Dec 2019 01:30:51 +0000 Subject: x86/kvm: Cache gfn to pfn translation __kvm_map_gfn()'s call to gfn_to_pfn_memslot() is relatively expensive and, in certain cases (such as when done from atomic context), cannot be made at all. Stashing the gfn-to-pfn mapping should help with both cases. This is part of CVE-2019-3016.
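A condensed, illustrative reduction of the caching logic the diff below adds to __kvm_map_gfn(); field names follow the new struct gfn_to_pfn_cache, but this sketch is not the literal kernel code:

static kvm_pfn_t cached_gfn_to_pfn_sketch(struct kvm_memory_slot *slot,
					  gfn_t gfn,
					  struct gfn_to_pfn_cache *cache,
					  u64 gen, bool atomic)
{
	/* Hit: same gfn and the memslot generation has not moved. */
	if (cache->pfn && cache->gfn == gfn && cache->generation == gen)
		return cache->pfn;

	/* Refilling uses gfn_to_pfn_memslot(), which may sleep. */
	if (atomic)
		return KVM_PFN_ERR_FAULT;

	/* The real code also releases the previously cached pfn here. */
	cache->pfn = gfn_to_pfn_memslot(slot, gfn);
	cache->gfn = gfn;
	cache->dirty = false;
	cache->generation = gen;
	return cache->pfn;
}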
Signed-off-by: Boris Ostrovsky Reviewed-by: Joao Martins Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 10 +++++ include/linux/kvm_host.h | 7 ++- include/linux/kvm_types.h | 9 +++- virt/kvm/kvm_main.c | 98 +++++++++++++++++++++++++++++++++-------- 5 files changed, 103 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b79cd6aa4075..f48a306e1d66 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -689,6 +689,7 @@ struct kvm_vcpu_arch { u64 last_steal; struct gfn_to_hva_cache stime; struct kvm_steal_time steal; + struct gfn_to_pfn_cache cache; } st; u64 tsc_offset; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c9369151e9f..0795bc876abc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9088,6 +9088,9 @@ static void fx_init(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; + struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; + + kvm_release_pfn(cache->pfn, cache->dirty, cache); kvmclock_reset(vcpu); @@ -9761,11 +9764,18 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { + struct kvm_vcpu *vcpu; + int i; + /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ kvm_mmu_invalidate_mmio_sptes(kvm, gen); + + /* Force re-initialization of steal_time cache */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0cb78f55b92c..71cb9cc105f0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -723,6 +723,7 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); void kvm_get_pfn(kvm_pfn_t pfn); +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, @@ -775,10 +776,12 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map); +int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool atomic); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); +int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool dirty, bool atomic); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 1c88e69db3d9..68e84cf42a3f 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -18,7 +18,7 @@ struct kvm_memslots; enum kvm_mr_change; -#include 
+#include /* * Address types: @@ -51,4 +51,11 @@ struct gfn_to_hva_cache { struct kvm_memory_slot *memslot; }; +struct gfn_to_pfn_cache { + u64 generation; + gfn_t gfn; + kvm_pfn_t pfn; + bool dirty; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9ef58a233a7c..67eb302a7240 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1821,27 +1821,72 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) +{ + if (pfn == 0) + return; + + if (cache) + cache->pfn = cache->gfn = 0; + + if (dirty) + kvm_release_pfn_dirty(pfn); + else + kvm_release_pfn_clean(pfn); +} + +static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, + struct gfn_to_pfn_cache *cache, u64 gen) +{ + kvm_release_pfn(cache->pfn, cache->dirty, cache); + + cache->pfn = gfn_to_pfn_memslot(slot, gfn); + cache->gfn = gfn; + cache->dirty = false; + cache->generation = gen; +} + static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, - struct kvm_host_map *map) + struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, + bool atomic) { kvm_pfn_t pfn; void *hva = NULL; struct page *page = KVM_UNMAPPED_PAGE; struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); + u64 gen = slots->generation; if (!map) return -EINVAL; - pfn = gfn_to_pfn_memslot(slot, gfn); + if (cache) { + if (!cache->pfn || cache->gfn != gfn || + cache->generation != gen) { + if (atomic) + return -EAGAIN; + kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); + } + pfn = cache->pfn; + } else { + if (atomic) + return -EAGAIN; + pfn = gfn_to_pfn_memslot(slot, gfn); + } if (is_error_noslot_pfn(pfn)) return -EINVAL; if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - hva = kmap(page); + if (atomic) + hva = kmap_atomic(page); + else + hva = kmap(page); #ifdef CONFIG_HAS_IOMEM - } else { + } else if (!atomic) { hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); + } else { + return -EINVAL; #endif } @@ -1856,20 +1901,25 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, return 0; } -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool atomic) { - return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map); + return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, + cache, atomic); } EXPORT_SYMBOL_GPL(kvm_map_gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { - return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map); + return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, + NULL, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_map); static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, - struct kvm_host_map *map, bool dirty) + struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, + bool dirty, bool atomic) { if (!map) return; @@ -1877,34 +1927,44 @@ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, if (!map->hva) return; - if (map->page != KVM_UNMAPPED_PAGE) - kunmap(map->page); + if (map->page != KVM_UNMAPPED_PAGE) { + if (atomic) + kunmap_atomic(map->hva); + else + kunmap(map->page); + } #ifdef CONFIG_HAS_IOMEM - else + else if (!atomic) memunmap(map->hva); + else + WARN_ONCE(1, "Unexpected unmapping in atomic context"); #endif - if (dirty) { + if (dirty) mark_page_dirty_in_slot(memslot, map->gfn); - kvm_release_pfn_dirty(map->pfn); - } else { - kvm_release_pfn_clean(map->pfn); - } + 
+ if (cache) cache->dirty |= dirty; + else + kvm_release_pfn(map->pfn, dirty, NULL); map->hva = NULL; map->page = NULL; } -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, + struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) { - __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, dirty); + __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, + cache, dirty, atomic); return 0; } EXPORT_SYMBOL_GPL(kvm_unmap_gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) { - __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, dirty); + __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, + dirty, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -- cgit v1.2.3 From b043138246a41064527cf019a3d51d9f015e9796 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 5 Dec 2019 03:45:32 +0000 Subject: x86/KVM: Make sure KVM_VCPU_FLUSH_TLB flag is not missed There is a potential race in record_steal_time() between setting host-local vcpu->arch.st.steal.preempted to zero (i.e. clearing KVM_VCPU_PREEMPTED) and propagating this value to the guest with kvm_write_guest_cached(). Between those two events the guest may still see KVM_VCPU_PREEMPTED in its copy of kvm_steal_time, set KVM_VCPU_FLUSH_TLB and assume that the hypervisor will do the right thing. Which it won't. Instead of copying, we should map kvm_steal_time, which guarantees atomicity of accesses to @preempted. This is part of CVE-2019-3016. Signed-off-by: Boris Ostrovsky Reviewed-by: Joao Martins Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0795bc876abc..f1845df7e7c3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2581,45 +2581,47 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) static void record_steal_time(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + /* -EAGAIN is returned in atomic context so we can just return. */ + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, + &map, &vcpu->arch.st.cache, false)) return; + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs.
*/ trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); - if (vcpu->arch.st.steal.version & 1) - vcpu->arch.st.steal.version += 1; /* first time write, random junk */ + vcpu->arch.st.steal.preempted = 0; - vcpu->arch.st.steal.version += 1; + if (st->version & 1) + st->version += 1; /* first time write, random junk */ - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + st->version += 1; smp_wmb(); - vcpu->arch.st.steal.steal += current->sched_info.run_delay - + st->steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - smp_wmb(); - vcpu->arch.st.steal.version += 1; + st->version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3501,18 +3503,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (vcpu->arch.st.steal.preempted) return; - vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, + &vcpu->arch.st.cache, true)) + return; + + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + + st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; - kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal.preempted, - offsetof(struct kvm_steal_time, preempted), - sizeof(vcpu->arch.st.steal.preempted)); + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From a6bd811f1209fe1c64c9f6fd578101d6436c6b6e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 6 Dec 2019 15:36:12 +0000 Subject: x86/KVM: Clean up host's steal time structure Now that we are mapping kvm_steal_time from the guest directly we don't need to keep a copy of it in kvm_vcpu_arch.st. The same is true for the stime field. This is part of CVE-2019-3016.
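To spell out the race the steal-time patches above close, a schematic contrast (pseudocode on kernel types, not code from the series): with a separate host-side copy, the guest can set KVM_VCPU_FLUSH_TLB between the host's read and its write-back; with the guest page mapped, clearing the field is a single atomic operation on the memory the guest actually sees.

	/* Old scheme: read a copy, then write back -- a lost-update window. */
	preempted = st_copy.preempted;		/* host reads its copy        */
						/* guest sets FLUSH_TLB now   */
	st_copy.preempted = 0;			/* write-back loses that bit  */

	/* New scheme: one atomic xchg() on the mapped kvm_steal_time page. */
	if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
		kvm_vcpu_flush_tlb(vcpu, false);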
Signed-off-by: Boris Ostrovsky Reviewed-by: Joao Martins Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/x86.c | 11 +++-------- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f48a306e1d66..4925bdbfb516 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -685,10 +685,9 @@ struct kvm_vcpu_arch { bool pvclock_set_guest_stopped_request; struct { + u8 preempted; u64 msr_val; u64 last_steal; - struct gfn_to_hva_cache stime; - struct kvm_steal_time steal; struct gfn_to_pfn_cache cache; } st; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1845df7e7c3..a0381ec905ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2604,7 +2604,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); - vcpu->arch.st.steal.preempted = 0; + vcpu->arch.st.preempted = 0; if (st->version & 1) st->version += 1; /* first time write, random junk */ @@ -2788,11 +2788,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS, - sizeof(struct kvm_steal_time))) - return 1; - vcpu->arch.st.msr_val = data; if (!(data & KVM_MSR_ENABLED)) @@ -3509,7 +3504,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (vcpu->arch.st.steal.preempted) + if (vcpu->arch.st.preempted) return; if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, @@ -3519,7 +3514,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); - st->preempted = vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); } -- cgit v1.2.3
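Finally, a hedged usage sketch of the reworked map/unmap API as it stands at the end of this series; the function name and the gpa parameter are invented for illustration, and error handling is minimal:

static void guest_page_update_sketch(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	struct kvm_host_map map;
	struct kvm_steal_time *st;

	/* atomic=false: a cache miss may be filled on this path. */
	if (kvm_map_gfn(vcpu, gpa >> PAGE_SHIFT, &map,
			&vcpu->arch.st.cache, false))
		return;

	st = map.hva + offset_in_page(gpa);
	/* ... read or modify guest-visible fields through st ... */

	/* dirty=true marks the page dirty; atomic=false permits kunmap(). */
	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
}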