diff options
52 files changed, 1694 insertions, 287 deletions
diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst index 5feb3706a7ae..af1b37441e0a 100644 --- a/Documentation/virt/kvm/vcpu-requests.rst +++ b/Documentation/virt/kvm/vcpu-requests.rst @@ -118,10 +118,12 @@ KVM_REQ_MMU_RELOAD necessary to inform each VCPU to completely refresh the tables. This request is used for that. -KVM_REQ_PENDING_TIMER +KVM_REQ_UNBLOCK - This request may be made from a timer handler run on the host on behalf - of a VCPU. It informs the VCPU thread to inject a timer interrupt. + This request informs the vCPU to exit kvm_vcpu_block. It is used for + example from timer handlers that run on the host on behalf of a vCPU, + or in order to update the interrupt routing and ensure that assigned + devices will wake up the vCPU. KVM_REQ_UNHALT diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index cf8df032b9c3..5e9b33cbac51 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -63,6 +63,7 @@ #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 #define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 +#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 #ifndef __ASSEMBLY__ @@ -201,6 +202,8 @@ extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); +extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); + extern u64 __vgic_v3_get_gic_config(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f612c090f2e4..01b9857757f2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -463,4 +463,9 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; } +static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) +{ + return test_bit(feature, vcpu->arch.features); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1cb39c0803a4..e720148232a0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -720,11 +720,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) return ret; } - if (run->immediate_exit) - return -EINTR; - vcpu_load(vcpu); + if (run->immediate_exit) { + ret = -EINTR; + goto out; + } + kvm_sigset_activate(vcpu); ret = 1; @@ -897,6 +899,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_sigset_deactivate(vcpu); +out: + /* + * In the unlikely event that we are returning to userspace + * with pending exceptions or PC adjustment, commit these + * adjustments in order to give userspace a consistent view of + * the vcpu state. Note that this relies on __kvm_adjust_pc() + * being preempt-safe on VHE. + */ + if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION | + KVM_ARM64_INCREMENT_PC))) + kvm_call_hyp(__kvm_adjust_pc, vcpu); + vcpu_put(vcpu); return ret; } diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 73629094f903..11541b94b328 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -296,7 +296,7 @@ static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) *vcpu_pc(vcpu) = vect_offset; } -void kvm_inject_exception(struct kvm_vcpu *vcpu) +static void kvm_inject_exception(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) { switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { @@ -329,3 +329,19 @@ void kvm_inject_exception(struct kvm_vcpu *vcpu) } } } + +/* + * Adjust the guest PC (and potentially exception state) depending on + * flags provided by the emulation code. + */ +void __kvm_adjust_pc(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { + kvm_inject_exception(vcpu); + vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | + KVM_ARM64_EXCEPT_MASK); + } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + kvm_skip_instr(vcpu); + vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; + } +} diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h index 61716359035d..4fdfeabefeb4 100644 --- a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h +++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h @@ -13,8 +13,6 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_host.h> -void kvm_inject_exception(struct kvm_vcpu *vcpu); - static inline void kvm_skip_instr(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { @@ -44,22 +42,6 @@ static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) } /* - * Adjust the guest PC on entry, depending on flags provided by EL1 - * for the purpose of emulation (MMIO, sysreg) or exception injection. - */ -static inline void __adjust_pc(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { - kvm_inject_exception(vcpu); - vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_EXCEPT_MASK); - } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { - kvm_skip_instr(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; - } -} - -/* * Skip an instruction while host sysregs are live. * Assumes host is always 64-bit. */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f36420a80474..1632f001f4ed 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -28,6 +28,13 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); } +static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + __kvm_adjust_pc(kern_hyp_va(vcpu)); +} + static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) { __kvm_flush_vm_context(); @@ -170,6 +177,7 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_vcpu_run), + HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index e342f7f4f4fb..4b60c0056c04 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -23,8 +23,8 @@ extern unsigned long hyp_nr_cpus; struct host_kvm host_kvm; -struct hyp_pool host_s2_mem; -struct hyp_pool host_s2_dev; +static struct hyp_pool host_s2_mem; +static struct hyp_pool host_s2_dev; /* * Copies of the host's CPU features registers holding sanitized values. diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 7488f53b0aa2..a3d3a275344e 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -17,7 +17,6 @@ #include <nvhe/trap_handler.h> struct hyp_pool hpool; -struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; unsigned long hyp_nr_cpus; #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ @@ -27,6 +26,7 @@ static void *vmemmap_base; static void *hyp_pgt_base; static void *host_s2_mem_pgt_base; static void *host_s2_dev_pgt_base; +static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static int divide_memory_pool(void *virt, unsigned long size) { diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index e9f6ea704d07..f7af9688c1f7 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -4,7 +4,6 @@ * Author: Marc Zyngier <marc.zyngier@arm.com> */ -#include <hyp/adjust_pc.h> #include <hyp/switch.h> #include <hyp/sysreg-sr.h> @@ -201,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ __debug_save_host_buffers_nvhe(vcpu); - __adjust_pc(vcpu); + __kvm_adjust_pc(vcpu); /* * We must restore the 32-bit state before the sysregs, thanks diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 7b8f7db5c1ed..b3229924d243 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -4,7 +4,6 @@ * Author: Marc Zyngier <marc.zyngier@arm.com> */ -#include <hyp/adjust_pc.h> #include <hyp/switch.h> #include <linux/arm-smccc.h> @@ -132,7 +131,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __load_guest_stage2(vcpu->arch.hw_mmu); __activate_traps(vcpu); - __adjust_pc(vcpu); + __kvm_adjust_pc(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c5d1f3c87dbd..c10207fed2f3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1156,13 +1156,13 @@ out_unlock: bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) - return 0; + return false; __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, range->may_block); - return 0; + return false; } bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1170,7 +1170,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) kvm_pfn_t pfn = pte_pfn(range->pte); if (!kvm->arch.mmu.pgt) - return 0; + return false; WARN_ON(range->end - range->start != 1); @@ -1190,7 +1190,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) PAGE_SIZE, __pfn_to_phys(pfn), KVM_PGTABLE_PROT_R, NULL); - return 0; + return false; } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1200,7 +1200,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t pte; if (!kvm->arch.mmu.pgt) - return 0; + return false; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); @@ -1213,7 +1213,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) - return 0; + return false; return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 956cdc240148..d37ebee085cf 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -166,6 +166,25 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) return 0; } +static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *tmp; + bool is32bit; + int i; + + is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) + return false; + + /* Check that the vcpus are either all 32bit or all 64bit */ + kvm_for_each_vcpu(i, tmp, vcpu->kvm) { + if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit) + return false; + } + + return true; +} + /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer @@ -217,13 +236,14 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } } + if (!vcpu_allowed_register_width(vcpu)) { + ret = -EINVAL; + goto out; + } + switch (vcpu->arch.target) { default: if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) { - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) { - ret = -EINVAL; - goto out; - } pstate = VCPU_RESET_PSTATE_SVC; } else { pstate = VCPU_RESET_PSTATE_EL1; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 76ea2800c33e..1a7968ad078c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -399,14 +399,14 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -414,7 +414,7 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -424,7 +424,7 @@ static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -434,21 +434,21 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; } static bool trap_bcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -456,7 +456,7 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -467,7 +467,7 @@ static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -477,22 +477,22 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; } static bool trap_wvr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]); + trace_trap_reg(__func__, rd->CRm, p->is_write, + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); return true; } @@ -500,7 +500,7 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -510,7 +510,7 @@ static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -520,21 +520,21 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; } static bool trap_wcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -542,7 +542,7 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -552,7 +552,7 @@ static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -562,7 +562,7 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; } static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1e83359f286b..7f2e90db2050 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -51,6 +51,7 @@ /* PPC-specific vcpu->requests bit members */ #define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0) #define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1) +#define KVM_REQ_PENDING_TIMER KVM_ARCH_REQ(2) #include <linux/mmu_notifier.h> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 28a80d240b76..7360350e66ff 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3936,7 +3936,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) break; } cur = ktime_get(); - } while (single_task_running() && ktime_before(cur, stop)); + } while (kvm_vcpu_can_poll(cur, stop)); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 323641097f63..e7bef91cee04 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -99,6 +99,7 @@ KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) +KVM_X86_OP_NULL(start_assignment) KVM_X86_OP_NULL(apicv_post_state_restore) KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) KVM_X86_OP_NULL(set_hv_timer) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55efbacfc244..9c7ced0e3171 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1352,6 +1352,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + void (*start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a0ccdb56076..5e5de05a8fbf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5111,7 +5111,7 @@ done: return rc; } -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; @@ -5322,7 +5322,8 @@ done_prefixes: ctxt->execute = opcode.u.execute; - if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD))) + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && + likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; if (unlikely(ctxt->d & diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f98370a39936..f00830e5202f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1172,6 +1172,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); u64 gfn; + int idx; if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || @@ -1190,9 +1191,16 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; hv->tsc_ref.tsc_sequence = 0; + + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&kvm->srcu); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + srcu_read_unlock(&kvm->srcu, idx); out_unlock: mutex_unlock(&hv->hv_lock); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index f016838faedd..3e870bf9ca4d 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -314,7 +314,6 @@ struct x86_emulate_ctxt { int interruptibility; bool perm_ok; /* do not check permissions if true */ - bool ud; /* inject an #UD if host doesn't support insn */ bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; @@ -491,7 +490,7 @@ enum x86_intercept { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c0ebef560bd1..8120e8614b92 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1598,11 +1598,19 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; + if (lapic_timer_advance_dynamic) { + adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); + /* + * If the timer fired early, reread the TSC to account for the + * overhead of the above adjustment to avoid waiting longer + * than is necessary. + */ + if (guest_tsc < tsc_deadline) + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + } + if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - - if (lapic_timer_advance_dynamic) - adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) @@ -1661,7 +1669,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(&apic->lapic_timer.pending); - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); if (from_timer_fn) kvm_vcpu_kick(vcpu); } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 95eeb5ac6a8a..237317b1eddd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1192,9 +1192,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) } /* - * Remove write access from all the SPTEs mapping GFNs [start, end). If - * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. + * Remove write access from all SPTEs at or above min_level that map GFNs + * [start, end). Returns true if an SPTE has been changed and the TLBs need to + * be flushed. */ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int min_level) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 712b4e0de481..0e62e6a2438c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -28,10 +28,8 @@ #include "svm.h" /* enable / disable AVIC */ -int avic; -#ifdef CONFIG_X86_LOCAL_APIC -module_param(avic, int, S_IRUGO); -#endif +bool avic; +module_param(avic, bool, S_IRUGO); #define SVM_AVIC_DOORBELL 0xc001011b diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 05eca131eaf2..e088086f3de6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1010,9 +1010,7 @@ static __init int svm_hardware_setup(void) } if (avic) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) { avic = false; } else { pr_info("AVIC enabled\n"); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2c9ece618b29..2908c6ab5bb4 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -480,7 +480,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -extern int avic; +extern bool avic; static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8dee8a5fbc17..aa0e7872fcc9 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -90,8 +90,7 @@ static inline bool cpu_has_vmx_preemption_timer(void) static inline bool cpu_has_vmx_posted_intr(void) { - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_load_ia32_efer(void) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 459748680daf..5f81ef092bd4 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -238,6 +238,20 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) /* + * Bail out of the block loop if the VM has an assigned + * device, but the blocking vCPU didn't reconfigure the + * PI.NV to the wakeup vector, i.e. the assigned device + * came along after the initial check in pi_pre_block(). + */ +void vmx_pi_start_assignment(struct kvm *kvm) +{ + if (!irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); +} + +/* * pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 0bdc41391c5b..7f7b2326caf5 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -95,5 +95,6 @@ void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); +void vmx_pi_start_assignment(struct kvm *kvm); #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4bceb5ca3a89..50b42d7a8a11 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4843,7 +4843,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; + unsigned long cr2, dr6; u32 vect_info; vect_info = vmx->idt_vectoring_info; @@ -4933,8 +4933,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; - rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: @@ -7721,6 +7720,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .nested_ops = &vmx_nested_ops, .update_pi_irte = pi_update_irte, + .start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bbc4e04e67ad..b594275d49b5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3105,6 +3105,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + } else { + st->preempted = 0; } vcpu->arch.st.preempted = 0; @@ -7226,6 +7228,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } @@ -7561,14 +7568,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, kvm_vcpu_check_breakpoint(vcpu, &r)) return r; - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; - - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); + r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; @@ -8360,6 +8360,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) vcpu->stat.directed_yield_attempted++; + if (single_task_running()) + goto no_yield; + rcu_read_lock(); map = rcu_dereference(vcpu->kvm->arch.apic_map); @@ -9496,7 +9499,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -10115,8 +10118,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu); /* * Trigger an rflags update that will inject or remove the trace @@ -11499,7 +11501,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { - atomic_inc(&kvm->arch.assigned_device_count); + if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) + static_call_cond(kvm_x86_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2f34487e21f2..76102efbf079 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/signal.h> #include <linux/sched.h> +#include <linux/sched/stat.h> #include <linux/bug.h> #include <linux/minmax.h> #include <linux/mm.h> @@ -146,7 +147,7 @@ static inline bool is_error_page(struct page *page) */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 #define KVM_REQUEST_ARCH_BASE 8 @@ -265,6 +266,11 @@ static inline bool kvm_vcpu_mapped(struct kvm_host_map *map) return !!map->hva; } +static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) +{ + return single_task_running() && !need_resched() && ktime_before(cur, stop); +} + /* * Sometimes a large or cross-page mmio needs to be broken up into separate * exits for userspace servicing. diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 3fd9a7e9d90c..79d9c44d1ad7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -8,6 +8,7 @@ * Note: you must update KVM_API_VERSION if you change this interface. */ +#include <linux/const.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/ioctl.h> @@ -1879,8 +1880,8 @@ struct kvm_hyperv_eventfd { * conversion after harvesting an entry. Also, it must not skip any * dirty bits, so that dirty bits are always harvested in sequence. */ -#define KVM_DIRTY_GFN_F_DIRTY BIT(0) -#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_DIRTY _BITUL(0) +#define KVM_DIRTY_GFN_F_RESET _BITUL(1) #define KVM_DIRTY_GFN_F_MASK 0x3 /* diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 3fd9a7e9d90c..79d9c44d1ad7 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -8,6 +8,7 @@ * Note: you must update KVM_API_VERSION if you change this interface. */ +#include <linux/const.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/ioctl.h> @@ -1879,8 +1880,8 @@ struct kvm_hyperv_eventfd { * conversion after harvesting an entry. Also, it must not skip any * dirty bits, so that dirty bits are always harvested in sequence. */ -#define KVM_DIRTY_GFN_F_DIRTY BIT(0) -#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_DIRTY _BITUL(0) +#define KVM_DIRTY_GFN_F_RESET _BITUL(1) #define KVM_DIRTY_GFN_F_MASK 0x3 /* diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index bd83158e0e0b..524c857a049c 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -41,5 +41,6 @@ /kvm_create_max_vcpus /kvm_page_table_test /memslot_modification_stress_test +/memslot_perf_test /set_memory_region_test /steal_time diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index e439d027939d..daaee1888b12 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -33,7 +33,7 @@ ifeq ($(ARCH),s390) UNAME_M := s390x endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/rbtree.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c @@ -74,6 +74,7 @@ TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += kvm_page_table_test TEST_GEN_PROGS_x86_64 += memslot_modification_stress_test +TEST_GEN_PROGS_x86_64 += memslot_perf_test TEST_GEN_PROGS_x86_64 += set_memory_region_test TEST_GEN_PROGS_x86_64 += steal_time diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 5f7a229c3af1..b74704305835 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -9,6 +9,7 @@ #define _GNU_SOURCE /* for pipe2 */ +#include <inttypes.h> #include <stdio.h> #include <stdlib.h> #include <time.h> @@ -38,6 +39,7 @@ static int nr_vcpus = 1; static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; +static size_t demand_paging_size; static char *guest_data_prototype; static void *vcpu_worker(void *data) @@ -71,36 +73,51 @@ static void *vcpu_worker(void *data) return NULL; } -static int handle_uffd_page_request(int uffd, uint64_t addr) +static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr) { - pid_t tid; + pid_t tid = syscall(__NR_gettid); struct timespec start; struct timespec ts_diff; - struct uffdio_copy copy; int r; - tid = syscall(__NR_gettid); + clock_gettime(CLOCK_MONOTONIC, &start); - copy.src = (uint64_t)guest_data_prototype; - copy.dst = addr; - copy.len = perf_test_args.host_page_size; - copy.mode = 0; + if (uffd_mode == UFFDIO_REGISTER_MODE_MISSING) { + struct uffdio_copy copy; - clock_gettime(CLOCK_MONOTONIC, &start); + copy.src = (uint64_t)guest_data_prototype; + copy.dst = addr; + copy.len = demand_paging_size; + copy.mode = 0; - r = ioctl(uffd, UFFDIO_COPY, ©); - if (r == -1) { - pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n", - addr, tid, errno); - return r; + r = ioctl(uffd, UFFDIO_COPY, ©); + if (r == -1) { + pr_info("Failed UFFDIO_COPY in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + } else if (uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { + struct uffdio_continue cont = {0}; + + cont.range.start = addr; + cont.range.len = demand_paging_size; + + r = ioctl(uffd, UFFDIO_CONTINUE, &cont); + if (r == -1) { + pr_info("Failed UFFDIO_CONTINUE in 0x%lx from thread %d with errno: %d\n", + addr, tid, errno); + return r; + } + } else { + TEST_FAIL("Invalid uffd mode %d", uffd_mode); } ts_diff = timespec_elapsed(start); - PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid, + PER_PAGE_DEBUG("UFFD page-in %d \t%ld ns\n", tid, timespec_to_ns(ts_diff)); PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n", - perf_test_args.host_page_size, addr, tid); + demand_paging_size, addr, tid); return 0; } @@ -108,6 +125,7 @@ static int handle_uffd_page_request(int uffd, uint64_t addr) bool quit_uffd_thread; struct uffd_handler_args { + int uffd_mode; int uffd; int pipefd; useconds_t delay; @@ -169,7 +187,7 @@ static void *uffd_handler_thread_fn(void *arg) if (r == -1) { if (errno == EAGAIN) continue; - pr_info("Read of uffd gor errno %d", errno); + pr_info("Read of uffd got errno %d\n", errno); return NULL; } @@ -184,7 +202,7 @@ static void *uffd_handler_thread_fn(void *arg) if (delay) usleep(delay); addr = msg.arg.pagefault.address; - r = handle_uffd_page_request(uffd, addr); + r = handle_uffd_page_request(uffd_args->uffd_mode, uffd, addr); if (r < 0) return NULL; pages++; @@ -198,43 +216,53 @@ static void *uffd_handler_thread_fn(void *arg) return NULL; } -static int setup_demand_paging(struct kvm_vm *vm, - pthread_t *uffd_handler_thread, int pipefd, - useconds_t uffd_delay, - struct uffd_handler_args *uffd_args, - void *hva, uint64_t len) +static void setup_demand_paging(struct kvm_vm *vm, + pthread_t *uffd_handler_thread, int pipefd, + int uffd_mode, useconds_t uffd_delay, + struct uffd_handler_args *uffd_args, + void *hva, void *alias, uint64_t len) { + bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR); int uffd; struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; + uint64_t expected_ioctls = ((uint64_t) 1) << _UFFDIO_COPY; - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - pr_info("uffd creation failed\n"); - return -1; + PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", + is_minor ? "MINOR" : "MISSING", + is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); + + /* In order to get minor faults, prefault via the alias. */ + if (is_minor) { + size_t p; + + expected_ioctls = ((uint64_t) 1) << _UFFDIO_CONTINUE; + + TEST_ASSERT(alias != NULL, "Alias required for minor faults"); + for (p = 0; p < (len / demand_paging_size); ++p) { + memcpy(alias + (p * demand_paging_size), + guest_data_prototype, demand_paging_size); + } } + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno); + uffdio_api.api = UFFD_API; uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - pr_info("ioctl uffdio_api failed\n"); - return -1; - } + TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1, + "ioctl UFFDIO_API failed: %" PRIu64, + (uint64_t)uffdio_api.api); uffdio_register.range.start = (uint64_t)hva; uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - pr_info("ioctl uffdio_register failed\n"); - return -1; - } - - if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) != - UFFD_API_RANGE_IOCTLS) { - pr_info("unexpected userfaultfd ioctl set\n"); - return -1; - } + uffdio_register.mode = uffd_mode; + TEST_ASSERT(ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) != -1, + "ioctl UFFDIO_REGISTER failed"); + TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) == + expected_ioctls, "missing userfaultfd ioctls"); + uffd_args->uffd_mode = uffd_mode; uffd_args->uffd = uffd; uffd_args->pipefd = pipefd; uffd_args->delay = uffd_delay; @@ -243,13 +271,12 @@ static int setup_demand_paging(struct kvm_vm *vm, PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n", hva, hva + len); - - return 0; } struct test_params { - bool use_uffd; + int uffd_mode; useconds_t uffd_delay; + enum vm_mem_backing_src_type src_type; bool partition_vcpu_memory_access; }; @@ -267,14 +294,16 @@ static void run_test(enum vm_guest_mode mode, void *arg) int r; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - VM_MEM_SRC_ANONYMOUS); + p->src_type); perf_test_args.wr_fract = 1; - guest_data_prototype = malloc(perf_test_args.host_page_size); + demand_paging_size = get_backing_src_pagesz(p->src_type); + + guest_data_prototype = malloc(demand_paging_size); TEST_ASSERT(guest_data_prototype, "Failed to allocate buffer for guest data pattern"); - memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size); + memset(guest_data_prototype, 0xAB, demand_paging_size); vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); @@ -282,7 +311,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, p->partition_vcpu_memory_access); - if (p->use_uffd) { + if (p->uffd_mode) { uffd_handler_threads = malloc(nr_vcpus * sizeof(*uffd_handler_threads)); TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); @@ -296,6 +325,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { vm_paddr_t vcpu_gpa; void *vcpu_hva; + void *vcpu_alias; uint64_t vcpu_mem_size; @@ -310,8 +340,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size); - /* Cache the HVA pointer of the region */ + /* Cache the host addresses of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); + vcpu_alias = addr_gpa2alias(vm, vcpu_gpa); /* * Set up user fault fd to handle demand paging @@ -321,13 +352,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) O_CLOEXEC | O_NONBLOCK); TEST_ASSERT(!r, "Failed to set up pipefd"); - r = setup_demand_paging(vm, - &uffd_handler_threads[vcpu_id], - pipefds[vcpu_id * 2], - p->uffd_delay, &uffd_args[vcpu_id], - vcpu_hva, vcpu_mem_size); - if (r < 0) - exit(-r); + setup_demand_paging(vm, &uffd_handler_threads[vcpu_id], + pipefds[vcpu_id * 2], p->uffd_mode, + p->uffd_delay, &uffd_args[vcpu_id], + vcpu_hva, vcpu_alias, + vcpu_mem_size); } } @@ -355,7 +384,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("All vCPU threads joined\n"); - if (p->use_uffd) { + if (p->uffd_mode) { char c; /* Tell the user fault fd handler threads to quit */ @@ -377,7 +406,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) free(guest_data_prototype); free(vcpu_threads); - if (p->use_uffd) { + if (p->uffd_mode) { free(uffd_handler_threads); free(uffd_args); free(pipefds); @@ -387,17 +416,19 @@ static void run_test(enum vm_guest_mode mode, void *arg) static void help(char *name) { puts(""); - printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" - " [-b memory] [-v vcpus] [-o]\n", name); + printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" + " [-b memory] [-t type] [-v vcpus] [-o]\n", name); guest_modes_help(); - printf(" -u: use User Fault FD to handle vCPU page\n" - " faults.\n"); + printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" + " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); printf(" -b: specify the size of the memory region which should be\n" " demand paged by each vCPU. e.g. 10M or 3G.\n" " Default: 1G\n"); + printf(" -t: The type of backing memory to use. Default: anonymous\n"); + backing_src_help(); printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); @@ -409,19 +440,24 @@ int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); struct test_params p = { + .src_type = VM_MEM_SRC_ANONYMOUS, .partition_vcpu_memory_access = true, }; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:ud:b:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); break; case 'u': - p.use_uffd = true; + if (!strcmp("MISSING", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MISSING; + else if (!strcmp("MINOR", optarg)) + p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR; + TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'."); break; case 'd': p.uffd_delay = strtoul(optarg, NULL, 0); @@ -430,6 +466,9 @@ int main(int argc, char *argv[]) case 'b': guest_percpu_mem_size = parse_size(optarg); break; + case 't': + p.src_type = parse_backing_src_type(optarg); + break; case 'v': nr_vcpus = atoi(optarg); TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, @@ -445,6 +484,11 @@ int main(int argc, char *argv[]) } } + if (p.uffd_mode == UFFDIO_REGISTER_MODE_MINOR && + !backing_src_is_shared(p.src_type)) { + TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -t"); + } + for_each_guest_mode(run_test, &p); return 0; diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 5aadf84c91c0..4b8db3bce610 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -132,6 +132,36 @@ static void run_test(uint32_t run) TEST_ASSERT(false, "%s: [%d] child escaped the ninja\n", __func__, run); } +void wait_for_child_setup(pid_t pid) +{ + /* + * Wait for the child to post to the semaphore, but wake up periodically + * to check if the child exited prematurely. + */ + for (;;) { + const struct timespec wait_period = { .tv_sec = 1 }; + int status; + + if (!sem_timedwait(sem, &wait_period)) + return; + + /* Child is still running, keep waiting. */ + if (pid != waitpid(pid, &status, WNOHANG)) + continue; + + /* + * Child is no longer running, which is not expected. + * + * If it exited with a non-zero status, we explicitly forward + * the child's status in case it exited with KSFT_SKIP. + */ + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + else + TEST_ASSERT(false, "Child exited unexpectedly"); + } +} + int main(int argc, char **argv) { uint32_t i; @@ -148,7 +178,7 @@ int main(int argc, char **argv) run_test(i); /* This function always exits */ pr_debug("%s: [%d] waiting semaphore\n", __func__, i); - sem_wait(sem); + wait_for_child_setup(pid); r = (rand() % DELAY_US_MAX) + 1; pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); usleep(r); diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index a8f022794ce3..fcd8e3855111 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -77,6 +77,7 @@ struct vm_guest_mode_params { }; extern const struct vm_guest_mode_params vm_guest_mode_params[]; +int open_kvm_dev_path_or_exit(void); int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id, @@ -146,6 +147,7 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa); void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva); vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva); +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa); /* * Address Guest Virtual to Guest Physical @@ -302,7 +304,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm); unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); -unsigned int vm_get_max_gfn(struct kvm_vm *vm); +uint64_t vm_get_max_gfn(struct kvm_vm *vm); int vm_get_fd(struct kvm_vm *vm); unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index fade3130eb01..d79be15dd3d2 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -17,6 +17,7 @@ #include <errno.h> #include <unistd.h> #include <fcntl.h> +#include <sys/mman.h> #include "kselftest.h" static inline int _no_printf(const char *format, ...) { return 0; } @@ -84,6 +85,8 @@ enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB, VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB, VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB, + VM_MEM_SRC_SHMEM, + VM_MEM_SRC_SHARED_HUGETLB, NUM_SRC_TYPES, }; @@ -100,4 +103,13 @@ size_t get_backing_src_pagesz(uint32_t i); void backing_src_help(void); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); +/* + * Whether or not the given source type is shared memory (as opposed to + * anonymous). + */ +static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) +{ + return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; +} + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index fc83f6c5902d..28e528c19d28 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -32,6 +32,34 @@ static void *align(void *x, size_t size) } /* + * Open KVM_DEV_PATH if available, otherwise exit the entire program. + * + * Input Args: + * flags - The flags to pass when opening KVM_DEV_PATH. + * + * Return: + * The opened file descriptor of /dev/kvm. + */ +static int _open_kvm_dev_path_or_exit(int flags) +{ + int fd; + + fd = open(KVM_DEV_PATH, flags); + if (fd < 0) { + print_skip("%s not available, is KVM loaded? (errno: %d)", + KVM_DEV_PATH, errno); + exit(KSFT_SKIP); + } + + return fd; +} + +int open_kvm_dev_path_or_exit(void) +{ + return _open_kvm_dev_path_or_exit(O_RDONLY); +} + +/* * Capability * * Input Args: @@ -52,10 +80,7 @@ int kvm_check_cap(long cap) int ret; int kvm_fd; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); - + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap); TEST_ASSERT(ret != -1, "KVM_CHECK_EXTENSION IOCTL failed,\n" " rc: %i errno: %i", ret, errno); @@ -128,9 +153,7 @@ void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size) static void vm_open(struct kvm_vm *vm, int perm) { - vm->kvm_fd = open(KVM_DEV_PATH, perm); - if (vm->kvm_fd < 0) - exit(KSFT_SKIP); + vm->kvm_fd = _open_kvm_dev_path_or_exit(perm); if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) { print_skip("immediate_exit not available"); @@ -203,7 +226,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) TEST_ASSERT(vm != NULL, "Insufficient Memory"); INIT_LIST_HEAD(&vm->vcpus); - INIT_LIST_HEAD(&vm->userspace_mem_regions); + vm->regions.gpa_tree = RB_ROOT; + vm->regions.hva_tree = RB_ROOT; + hash_init(vm->regions.slot_hash); vm->mode = mode; vm->type = 0; @@ -295,7 +320,7 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, */ uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus; uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages; struct kvm_vm *vm; int i; @@ -355,13 +380,14 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, */ void kvm_vm_restart(struct kvm_vm *vmp, int perm) { + int ctr; struct userspace_mem_region *region; vm_open(vmp, perm); if (vmp->has_irqchip) vm_create_irqchip(vmp); - list_for_each_entry(region, &vmp->userspace_mem_regions, list) { + hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" " rc: %i errno: %i\n" @@ -424,14 +450,21 @@ uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm) static struct userspace_mem_region * userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) { - struct userspace_mem_region *region; + struct rb_node *node; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + for (node = vm->regions.gpa_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, gpa_node); uint64_t existing_start = region->region.guest_phys_addr; uint64_t existing_end = region->region.guest_phys_addr + region->region.memory_size - 1; if (start <= existing_end && end >= existing_start) return region; + + if (start < existing_start) + node = node->rb_left; + else + node = node->rb_right; } return NULL; @@ -546,11 +579,16 @@ void kvm_vm_release(struct kvm_vm *vmp) } static void __vm_mem_region_delete(struct kvm_vm *vm, - struct userspace_mem_region *region) + struct userspace_mem_region *region, + bool unlink) { int ret; - list_del(®ion->list); + if (unlink) { + rb_erase(®ion->gpa_node, &vm->regions.gpa_tree); + rb_erase(®ion->hva_node, &vm->regions.hva_tree); + hash_del(®ion->slot_node); + } region->region.memory_size = 0; ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); @@ -569,14 +607,16 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, */ void kvm_vm_free(struct kvm_vm *vmp) { - struct userspace_mem_region *region, *tmp; + int ctr; + struct hlist_node *node; + struct userspace_mem_region *region; if (vmp == NULL) return; /* Free userspace_mem_regions. */ - list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list) - __vm_mem_region_delete(vmp, region); + hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) + __vm_mem_region_delete(vmp, region, false); /* Free sparsebit arrays. */ sparsebit_free(&vmp->vpages_valid); @@ -658,13 +698,64 @@ int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len) return 0; } +static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), gpa_node); + parent = *cur; + if (region->region.guest_phys_addr < + cregion->region.guest_phys_addr) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->region.guest_phys_addr != + cregion->region.guest_phys_addr, + "Duplicate GPA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->gpa_node, parent, cur); + rb_insert_color(®ion->gpa_node, gpa_tree); +} + +static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree, + struct userspace_mem_region *region) +{ + struct rb_node **cur, *parent; + + for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) { + struct userspace_mem_region *cregion; + + cregion = container_of(*cur, typeof(*cregion), hva_node); + parent = *cur; + if (region->host_mem < cregion->host_mem) + cur = &(*cur)->rb_left; + else { + TEST_ASSERT(region->host_mem != + cregion->host_mem, + "Duplicate HVA in region tree"); + + cur = &(*cur)->rb_right; + } + } + + rb_link_node(®ion->hva_node, parent, cur); + rb_insert_color(®ion->hva_node, hva_tree); +} + /* * VM Userspace Memory Region Add * * Input Args: * vm - Virtual Machine - * backing_src - Storage source for this region. - * NULL to use anonymous memory. + * src_type - Storage source for this region. + * NULL to use anonymous memory. * guest_paddr - Starting guest physical address * slot - KVM region slot * npages - Number of physical pages @@ -722,7 +813,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, (uint64_t) region->region.memory_size); /* Confirm no region with the requested slot already exists. */ - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + slot) { if (region->region.slot != slot) continue; @@ -755,11 +847,30 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, if (alignment > 1) region->mmap_size += alignment; + region->fd = -1; + if (backing_src_is_shared(src_type)) { + int memfd_flags = MFD_CLOEXEC; + + if (src_type == VM_MEM_SRC_SHARED_HUGETLB) + memfd_flags |= MFD_HUGETLB; + + region->fd = memfd_create("kvm_selftest", memfd_flags); + TEST_ASSERT(region->fd != -1, + "memfd_create failed, errno: %i", errno); + + ret = ftruncate(region->fd, region->mmap_size); + TEST_ASSERT(ret == 0, "ftruncate failed, errno: %i", errno); + + ret = fallocate(region->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + region->mmap_size); + TEST_ASSERT(ret == 0, "fallocate failed, errno: %i", errno); + } + region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS - | vm_mem_backing_src_alias(src_type)->flag, - -1, 0); + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); TEST_ASSERT(region->mmap_start != MAP_FAILED, "test_malloc failed, mmap_start: %p errno: %i", region->mmap_start, errno); @@ -793,8 +904,23 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, ret, errno, slot, flags, guest_paddr, (uint64_t) region->region.memory_size); - /* Add to linked-list of memory regions. */ - list_add(®ion->list, &vm->userspace_mem_regions); + /* Add to quick lookup data structures */ + vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); + vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region); + hash_add(vm->regions.slot_hash, ®ion->slot_node, slot); + + /* If shared memory, create an alias. */ + if (region->fd >= 0) { + region->mmap_alias = mmap(NULL, region->mmap_size, + PROT_READ | PROT_WRITE, + vm_mem_backing_src_alias(src_type)->flag, + region->fd, 0); + TEST_ASSERT(region->mmap_alias != MAP_FAILED, + "mmap of alias failed, errno: %i", errno); + + /* Align host alias address */ + region->host_alias = align(region->mmap_alias, alignment); + } } /* @@ -817,10 +943,10 @@ memslot2region(struct kvm_vm *vm, uint32_t memslot) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each_possible(vm->regions.slot_hash, region, slot_node, + memslot) if (region->region.slot == memslot) return region; - } fprintf(stderr, "No mem region with the requested slot found,\n" " requested slot: %u\n", memslot); @@ -905,7 +1031,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) */ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) { - __vm_mem_region_delete(vm, memslot2region(vm, slot)); + __vm_mem_region_delete(vm, memslot2region(vm, slot), true); } /* @@ -925,9 +1051,7 @@ static int vcpu_mmap_sz(void) { int dev_fd, ret; - dev_fd = open(KVM_DEV_PATH, O_RDONLY); - if (dev_fd < 0) - exit(KSFT_SKIP); + dev_fd = open_kvm_dev_path_or_exit(); ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL); TEST_ASSERT(ret >= sizeof(struct kvm_run), @@ -1099,6 +1223,9 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0); virt_pgd_alloc(vm, pgd_memslot); + vm_paddr_t paddr = vm_phy_pages_alloc(vm, pages, + KVM_UTIL_MIN_PFN * vm->page_size, + data_memslot); /* * Find an unused range of virtual page addresses of at least @@ -1108,11 +1235,7 @@ vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min, /* Map the virtual pages. */ for (vm_vaddr_t vaddr = vaddr_start; pages > 0; - pages--, vaddr += vm->page_size) { - vm_paddr_t paddr; - - paddr = vm_phy_page_alloc(vm, - KVM_UTIL_MIN_PFN * vm->page_size, data_memslot); + pages--, vaddr += vm->page_size, paddr += vm->page_size) { virt_pg_map(vm, vaddr, paddr, pgd_memslot); @@ -1177,16 +1300,14 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) { struct userspace_mem_region *region; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((gpa >= region->region.guest_phys_addr) - && (gpa <= (region->region.guest_phys_addr - + region->region.memory_size - 1))) - return (void *) ((uintptr_t) region->host_mem - + (gpa - region->region.guest_phys_addr)); + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) { + TEST_FAIL("No vm physical memory at 0x%lx", gpa); + return NULL; } - TEST_FAIL("No vm physical memory at 0x%lx", gpa); - return NULL; + return (void *)((uintptr_t)region->host_mem + + (gpa - region->region.guest_phys_addr)); } /* @@ -1208,15 +1329,22 @@ void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa) */ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) { - struct userspace_mem_region *region; + struct rb_node *node; - list_for_each_entry(region, &vm->userspace_mem_regions, list) { - if ((hva >= region->host_mem) - && (hva <= (region->host_mem - + region->region.memory_size - 1))) - return (vm_paddr_t) ((uintptr_t) - region->region.guest_phys_addr - + (hva - (uintptr_t) region->host_mem)); + for (node = vm->regions.hva_tree.rb_node; node; ) { + struct userspace_mem_region *region = + container_of(node, struct userspace_mem_region, hva_node); + + if (hva >= region->host_mem) { + if (hva <= (region->host_mem + + region->region.memory_size - 1)) + return (vm_paddr_t)((uintptr_t) + region->region.guest_phys_addr + + (hva - (uintptr_t)region->host_mem)); + + node = node->rb_right; + } else + node = node->rb_left; } TEST_FAIL("No mapping to a guest physical address, hva: %p", hva); @@ -1224,6 +1352,42 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva) } /* + * Address VM physical to Host Virtual *alias*. + * + * Input Args: + * vm - Virtual Machine + * gpa - VM physical address + * + * Output Args: None + * + * Return: + * Equivalent address within the host virtual *alias* area, or NULL + * (without failing the test) if the guest memory is not shared (so + * no alias exists). + * + * When vm_create() and related functions are called with a shared memory + * src_type, we also create a writable, shared alias mapping of the + * underlying guest memory. This allows the host to manipulate guest memory + * without mapping that memory in the guest's address space. And, for + * userfaultfd-based demand paging, we can do so without triggering userfaults. + */ +void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa) +{ + struct userspace_mem_region *region; + uintptr_t offset; + + region = userspace_mem_region_find(vm, gpa, gpa); + if (!region) + return NULL; + + if (!region->host_alias) + return NULL; + + offset = gpa - region->region.guest_phys_addr; + return (void *) ((uintptr_t) region->host_alias + offset); +} + +/* * VM Create IRQ Chip * * Input Args: @@ -1822,6 +1986,7 @@ int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr, */ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) { + int ctr; struct userspace_mem_region *region; struct vcpu *vcpu; @@ -1829,7 +1994,7 @@ void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd); fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size); fprintf(stream, "%*sMem Regions:\n", indent, ""); - list_for_each_entry(region, &vm->userspace_mem_regions, list) { + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) { fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx " "host_virt: %p\n", indent + 2, "", (uint64_t) region->region.guest_phys_addr, @@ -2015,10 +2180,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm) if (vm == NULL) { /* Ensure that the KVM vendor-specific module is loaded. */ - f = fopen(KVM_DEV_PATH, "r"); - TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d", - errno); - fclose(f); + close(open_kvm_dev_path_or_exit()); } f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r"); @@ -2041,7 +2203,7 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm) return vm->page_shift; } -unsigned int vm_get_max_gfn(struct kvm_vm *vm) +uint64_t vm_get_max_gfn(struct kvm_vm *vm) { return vm->max_gfn; } diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 91ce1b5d480b..a03febc24ba6 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -8,6 +8,9 @@ #ifndef SELFTEST_KVM_UTIL_INTERNAL_H #define SELFTEST_KVM_UTIL_INTERNAL_H +#include "linux/hashtable.h" +#include "linux/rbtree.h" + #include "sparsebit.h" struct userspace_mem_region { @@ -16,9 +19,13 @@ struct userspace_mem_region { int fd; off_t offset; void *host_mem; + void *host_alias; void *mmap_start; + void *mmap_alias; size_t mmap_size; - struct list_head list; + struct rb_node gpa_node; + struct rb_node hva_node; + struct hlist_node slot_node; }; struct vcpu { @@ -31,6 +38,12 @@ struct vcpu { uint32_t dirty_gfns_count; }; +struct userspace_mem_regions { + struct rb_root gpa_tree; + struct rb_root hva_tree; + DECLARE_HASHTABLE(slot_hash, 9); +}; + struct kvm_vm { int mode; unsigned long type; @@ -43,7 +56,7 @@ struct kvm_vm { unsigned int va_bits; uint64_t max_gfn; struct list_head vcpus; - struct list_head userspace_mem_regions; + struct userspace_mem_regions regions; struct sparsebit *vpages_valid; struct sparsebit *vpages_mapped; bool has_irqchip; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 81490b9b4e32..abf381800a59 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2020, Google LLC. */ +#include <inttypes.h> #include "kvm_util.h" #include "perf_test_util.h" @@ -80,7 +81,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, */ TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), "Requested more guest memory than address space allows.\n" - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + " guest pages: %" PRIx64 " max gfn: %" PRIx64 + " vcpus: %d wss: %" PRIx64 "]\n", guest_num_pages, vm_get_max_gfn(vm), vcpus, vcpu_memory_bytes); diff --git a/tools/testing/selftests/kvm/lib/rbtree.c b/tools/testing/selftests/kvm/lib/rbtree.c new file mode 100644 index 000000000000..a703f0194ea3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/rbtree.c @@ -0,0 +1 @@ +#include "../../../../lib/rbtree.c" diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 63d2bc7d757b..6ad6c8276b2e 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -168,70 +168,87 @@ size_t get_def_hugetlb_pagesz(void) const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i) { + static const int anon_flags = MAP_PRIVATE | MAP_ANONYMOUS; + static const int anon_huge_flags = anon_flags | MAP_HUGETLB; + static const struct vm_mem_backing_src_alias aliases[] = { [VM_MEM_SRC_ANONYMOUS] = { .name = "anonymous", - .flag = 0, + .flag = anon_flags, }, [VM_MEM_SRC_ANONYMOUS_THP] = { .name = "anonymous_thp", - .flag = 0, + .flag = anon_flags, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB] = { .name = "anonymous_hugetlb", - .flag = MAP_HUGETLB, + .flag = anon_huge_flags, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16KB] = { .name = "anonymous_hugetlb_16kb", - .flag = MAP_HUGETLB | MAP_HUGE_16KB, + .flag = anon_huge_flags | MAP_HUGE_16KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_64KB] = { .name = "anonymous_hugetlb_64kb", - .flag = MAP_HUGETLB | MAP_HUGE_64KB, + .flag = anon_huge_flags | MAP_HUGE_64KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512KB] = { .name = "anonymous_hugetlb_512kb", - .flag = MAP_HUGETLB | MAP_HUGE_512KB, + .flag = anon_huge_flags | MAP_HUGE_512KB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1MB] = { .name = "anonymous_hugetlb_1mb", - .flag = MAP_HUGETLB | MAP_HUGE_1MB, + .flag = anon_huge_flags | MAP_HUGE_1MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2MB] = { .name = "anonymous_hugetlb_2mb", - .flag = MAP_HUGETLB | MAP_HUGE_2MB, + .flag = anon_huge_flags | MAP_HUGE_2MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_8MB] = { .name = "anonymous_hugetlb_8mb", - .flag = MAP_HUGETLB | MAP_HUGE_8MB, + .flag = anon_huge_flags | MAP_HUGE_8MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16MB] = { .name = "anonymous_hugetlb_16mb", - .flag = MAP_HUGETLB | MAP_HUGE_16MB, + .flag = anon_huge_flags | MAP_HUGE_16MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_32MB] = { .name = "anonymous_hugetlb_32mb", - .flag = MAP_HUGETLB | MAP_HUGE_32MB, + .flag = anon_huge_flags | MAP_HUGE_32MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_256MB] = { .name = "anonymous_hugetlb_256mb", - .flag = MAP_HUGETLB | MAP_HUGE_256MB, + .flag = anon_huge_flags | MAP_HUGE_256MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_512MB] = { .name = "anonymous_hugetlb_512mb", - .flag = MAP_HUGETLB | MAP_HUGE_512MB, + .flag = anon_huge_flags | MAP_HUGE_512MB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB] = { .name = "anonymous_hugetlb_1gb", - .flag = MAP_HUGETLB | MAP_HUGE_1GB, + .flag = anon_huge_flags | MAP_HUGE_1GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_2GB] = { .name = "anonymous_hugetlb_2gb", - .flag = MAP_HUGETLB | MAP_HUGE_2GB, + .flag = anon_huge_flags | MAP_HUGE_2GB, }, [VM_MEM_SRC_ANONYMOUS_HUGETLB_16GB] = { .name = "anonymous_hugetlb_16gb", - .flag = MAP_HUGETLB | MAP_HUGE_16GB, + .flag = anon_huge_flags | MAP_HUGE_16GB, + }, + [VM_MEM_SRC_SHMEM] = { + .name = "shmem", + .flag = MAP_SHARED, + }, + [VM_MEM_SRC_SHARED_HUGETLB] = { + .name = "shared_hugetlb", + /* + * No MAP_HUGETLB, we use MFD_HUGETLB instead. Since + * we're using "file backed" memory, we need to specify + * this when the FD is created, not when the area is + * mapped. + */ + .flag = MAP_SHARED, }, }; _Static_assert(ARRAY_SIZE(aliases) == NUM_SRC_TYPES, @@ -250,10 +267,12 @@ size_t get_backing_src_pagesz(uint32_t i) switch (i) { case VM_MEM_SRC_ANONYMOUS: + case VM_MEM_SRC_SHMEM: return getpagesize(); case VM_MEM_SRC_ANONYMOUS_THP: return get_trans_hugepagesz(); case VM_MEM_SRC_ANONYMOUS_HUGETLB: + case VM_MEM_SRC_SHARED_HUGETLB: return get_def_hugetlb_pagesz(); default: return MAP_HUGE_PAGE_SIZE(flag); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index a8906e60a108..efe235044421 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -657,9 +657,7 @@ struct kvm_cpuid2 *kvm_get_supported_cpuid(void) return cpuid; cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid); TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n", @@ -691,9 +689,7 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index) buffer.header.nmsrs = 1; buffer.entry.index = msr_index; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); r = ioctl(kvm_fd, KVM_GET_MSRS, &buffer.header); TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" @@ -986,9 +982,7 @@ struct kvm_msr_list *kvm_get_msr_index_list(void) struct kvm_msr_list *list; int nmsrs, r, kvm_fd; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); nmsrs = kvm_get_num_msrs_fd(kvm_fd); list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); @@ -1312,9 +1306,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void) return cpuid; cpuid = allocate_kvm_cpuid2(); - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); ret = ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid); TEST_ASSERT(ret == 0, "KVM_GET_SUPPORTED_HV_CPUID failed %d %d\n", diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 6096bf0a5b34..98351ba0933c 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -71,14 +71,22 @@ struct memslot_antagonist_args { }; static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, - uint64_t nr_modifications, uint64_t gpa) + uint64_t nr_modifications) { + const uint64_t pages = 1; + uint64_t gpa; int i; + /* + * Add the dummy memslot just below the perf_test_util memslot, which is + * at the top of the guest physical address space. + */ + gpa = guest_test_phys_mem - pages * vm_get_page_size(vm); + for (i = 0; i < nr_modifications; i++) { usleep(delay); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, - DUMMY_MEMSLOT_INDEX, 1, 0); + DUMMY_MEMSLOT_INDEX, pages, 0); vm_mem_region_delete(vm, DUMMY_MEMSLOT_INDEX); } @@ -120,11 +128,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Started all vCPUs\n"); add_remove_memslot(vm, p->memslot_modification_delay, - p->nr_memslot_modifications, - guest_test_phys_mem + - (guest_percpu_mem_size * nr_vcpus) + - perf_test_args.host_page_size + - perf_test_args.guest_page_size); + p->nr_memslot_modifications); run_vcpus = false; diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c new file mode 100644 index 000000000000..9307f25d8130 --- /dev/null +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -0,0 +1,1037 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A memslot-related performance benchmark. + * + * Copyright (C) 2021 Oracle and/or its affiliates. + * + * Basic guest setup / host vCPU thread code lifted from set_memory_region_test. + */ +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <time.h> +#include <unistd.h> + +#include <linux/compiler.h> + +#include <test_util.h> +#include <kvm_util.h> +#include <processor.h> + +#define VCPU_ID 0 + +#define MEM_SIZE ((512U << 20) + 4096) +#define MEM_SIZE_PAGES (MEM_SIZE / 4096) +#define MEM_GPA 0x10000000UL +#define MEM_AUX_GPA MEM_GPA +#define MEM_SYNC_GPA MEM_AUX_GPA +#define MEM_TEST_GPA (MEM_AUX_GPA + 4096) +#define MEM_TEST_SIZE (MEM_SIZE - 4096) +static_assert(MEM_SIZE % 4096 == 0, "invalid mem size"); +static_assert(MEM_TEST_SIZE % 4096 == 0, "invalid mem test size"); + +/* + * 32 MiB is max size that gets well over 100 iterations on 509 slots. + * Considering that each slot needs to have at least one page up to + * 8194 slots in use can then be tested (although with slightly + * limited resolution). + */ +#define MEM_SIZE_MAP ((32U << 20) + 4096) +#define MEM_SIZE_MAP_PAGES (MEM_SIZE_MAP / 4096) +#define MEM_TEST_MAP_SIZE (MEM_SIZE_MAP - 4096) +#define MEM_TEST_MAP_SIZE_PAGES (MEM_TEST_MAP_SIZE / 4096) +static_assert(MEM_SIZE_MAP % 4096 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE % 4096 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE_PAGES % 2 == 0, "invalid map test region size"); +static_assert(MEM_TEST_MAP_SIZE_PAGES > 2, "invalid map test region size"); + +/* + * 128 MiB is min size that fills 32k slots with at least one page in each + * while at the same time gets 100+ iterations in such test + */ +#define MEM_TEST_UNMAP_SIZE (128U << 20) +#define MEM_TEST_UNMAP_SIZE_PAGES (MEM_TEST_UNMAP_SIZE / 4096) +/* 2 MiB chunk size like a typical huge page */ +#define MEM_TEST_UNMAP_CHUNK_PAGES (2U << (20 - 12)) +static_assert(MEM_TEST_UNMAP_SIZE <= MEM_TEST_SIZE, + "invalid unmap test region size"); +static_assert(MEM_TEST_UNMAP_SIZE % 4096 == 0, + "invalid unmap test region size"); +static_assert(MEM_TEST_UNMAP_SIZE_PAGES % + (2 * MEM_TEST_UNMAP_CHUNK_PAGES) == 0, + "invalid unmap test region size"); + +/* + * For the move active test the middle of the test area is placed on + * a memslot boundary: half lies in the memslot being moved, half in + * other memslot(s). + * + * When running this test with 32k memslots (32764, really) each memslot + * contains 4 pages. + * The last one additionally contains the remaining 21 pages of memory, + * for the total size of 25 pages. + * Hence, the maximum size here is 50 pages. + */ +#define MEM_TEST_MOVE_SIZE_PAGES (50) +#define MEM_TEST_MOVE_SIZE (MEM_TEST_MOVE_SIZE_PAGES * 4096) +#define MEM_TEST_MOVE_GPA_DEST (MEM_GPA + MEM_SIZE) +static_assert(MEM_TEST_MOVE_SIZE <= MEM_TEST_SIZE, + "invalid move test region size"); + +#define MEM_TEST_VAL_1 0x1122334455667788 +#define MEM_TEST_VAL_2 0x99AABBCCDDEEFF00 + +struct vm_data { + struct kvm_vm *vm; + pthread_t vcpu_thread; + uint32_t nslots; + uint64_t npages; + uint64_t pages_per_slot; + void **hva_slots; + bool mmio_ok; + uint64_t mmio_gpa_min; + uint64_t mmio_gpa_max; +}; + +struct sync_area { + atomic_bool start_flag; + atomic_bool exit_flag; + atomic_bool sync_flag; + void *move_area_ptr; +}; + +/* + * Technically, we need also for the atomic bool to be address-free, which + * is recommended, but not strictly required, by C11 for lockless + * implementations. + * However, in practice both GCC and Clang fulfill this requirement on + * all KVM-supported platforms. + */ +static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); + +static sem_t vcpu_ready; + +static bool map_unmap_verify; + +static bool verbose; +#define pr_info_v(...) \ + do { \ + if (verbose) \ + pr_info(__VA_ARGS__); \ + } while (0) + +static void *vcpu_worker(void *data) +{ + struct vm_data *vm = data; + struct kvm_run *run; + struct ucall uc; + uint64_t cmd; + + run = vcpu_state(vm->vm, VCPU_ID); + while (1) { + vcpu_run(vm->vm, VCPU_ID); + + if (run->exit_reason == KVM_EXIT_IO) { + cmd = get_ucall(vm->vm, VCPU_ID, &uc); + if (cmd != UCALL_SYNC) + break; + + sem_post(&vcpu_ready); + continue; + } + + if (run->exit_reason != KVM_EXIT_MMIO) + break; + + TEST_ASSERT(vm->mmio_ok, "Unexpected mmio exit"); + TEST_ASSERT(run->mmio.is_write, "Unexpected mmio read"); + TEST_ASSERT(run->mmio.len == 8, + "Unexpected exit mmio size = %u", run->mmio.len); + TEST_ASSERT(run->mmio.phys_addr >= vm->mmio_gpa_min && + run->mmio.phys_addr <= vm->mmio_gpa_max, + "Unexpected exit mmio address = 0x%llx", + run->mmio.phys_addr); + } + + if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT) + TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2]); + + return NULL; +} + +static void wait_for_vcpu(void) +{ + struct timespec ts; + + TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts), + "clock_gettime() failed: %d\n", errno); + + ts.tv_sec += 2; + TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts), + "sem_timedwait() failed: %d\n", errno); +} + +static void *vm_gpa2hva(struct vm_data *data, uint64_t gpa, uint64_t *rempages) +{ + uint64_t gpage, pgoffs; + uint32_t slot, slotoffs; + void *base; + + TEST_ASSERT(gpa >= MEM_GPA, "Too low gpa to translate"); + TEST_ASSERT(gpa < MEM_GPA + data->npages * 4096, + "Too high gpa to translate"); + gpa -= MEM_GPA; + + gpage = gpa / 4096; + pgoffs = gpa % 4096; + slot = min(gpage / data->pages_per_slot, (uint64_t)data->nslots - 1); + slotoffs = gpage - (slot * data->pages_per_slot); + + if (rempages) { + uint64_t slotpages; + + if (slot == data->nslots - 1) + slotpages = data->npages - slot * data->pages_per_slot; + else + slotpages = data->pages_per_slot; + + TEST_ASSERT(!pgoffs, + "Asking for remaining pages in slot but gpa not page aligned"); + *rempages = slotpages - slotoffs; + } + + base = data->hva_slots[slot]; + return (uint8_t *)base + slotoffs * 4096 + pgoffs; +} + +static uint64_t vm_slot2gpa(struct vm_data *data, uint32_t slot) +{ + TEST_ASSERT(slot < data->nslots, "Too high slot number"); + + return MEM_GPA + slot * data->pages_per_slot * 4096; +} + +static struct vm_data *alloc_vm(void) +{ + struct vm_data *data; + + data = malloc(sizeof(*data)); + TEST_ASSERT(data, "malloc(vmdata) failed"); + + data->vm = NULL; + data->hva_slots = NULL; + + return data; +} + +static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots, + void *guest_code, uint64_t mempages, + struct timespec *slot_runtime) +{ + uint32_t max_mem_slots; + uint64_t rempages; + uint64_t guest_addr; + uint32_t slot; + struct timespec tstart; + struct sync_area *sync; + + max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS); + TEST_ASSERT(max_mem_slots > 1, + "KVM_CAP_NR_MEMSLOTS should be greater than 1"); + TEST_ASSERT(nslots > 1 || nslots == -1, + "Slot count cap should be greater than 1"); + if (nslots != -1) + max_mem_slots = min(max_mem_slots, (uint32_t)nslots); + pr_info_v("Allowed number of memory slots: %"PRIu32"\n", max_mem_slots); + + TEST_ASSERT(mempages > 1, + "Can't test without any memory"); + + data->npages = mempages; + data->nslots = max_mem_slots - 1; + data->pages_per_slot = mempages / data->nslots; + if (!data->pages_per_slot) { + *maxslots = mempages + 1; + return false; + } + + rempages = mempages % data->nslots; + data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots); + TEST_ASSERT(data->hva_slots, "malloc() fail"); + + data->vm = vm_create_default(VCPU_ID, 1024, guest_code); + + pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n", + max_mem_slots - 1, data->pages_per_slot, rempages); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + for (slot = 1, guest_addr = MEM_GPA; slot < max_mem_slots; slot++) { + uint64_t npages; + + npages = data->pages_per_slot; + if (slot == max_mem_slots - 1) + npages += rempages; + + vm_userspace_mem_region_add(data->vm, VM_MEM_SRC_ANONYMOUS, + guest_addr, slot, npages, + 0); + guest_addr += npages * 4096; + } + *slot_runtime = timespec_elapsed(tstart); + + for (slot = 0, guest_addr = MEM_GPA; slot < max_mem_slots - 1; slot++) { + uint64_t npages; + uint64_t gpa; + + npages = data->pages_per_slot; + if (slot == max_mem_slots - 2) + npages += rempages; + + gpa = vm_phy_pages_alloc(data->vm, npages, guest_addr, + slot + 1); + TEST_ASSERT(gpa == guest_addr, + "vm_phy_pages_alloc() failed\n"); + + data->hva_slots[slot] = addr_gpa2hva(data->vm, guest_addr); + memset(data->hva_slots[slot], 0, npages * 4096); + + guest_addr += npages * 4096; + } + + virt_map(data->vm, MEM_GPA, MEM_GPA, mempages, 0); + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + atomic_init(&sync->start_flag, false); + atomic_init(&sync->exit_flag, false); + atomic_init(&sync->sync_flag, false); + + data->mmio_ok = false; + + return true; +} + +static void launch_vm(struct vm_data *data) +{ + pr_info_v("Launching the test VM\n"); + + pthread_create(&data->vcpu_thread, NULL, vcpu_worker, data); + + /* Ensure the guest thread is spun up. */ + wait_for_vcpu(); +} + +static void free_vm(struct vm_data *data) +{ + kvm_vm_free(data->vm); + free(data->hva_slots); + free(data); +} + +static void wait_guest_exit(struct vm_data *data) +{ + pthread_join(data->vcpu_thread, NULL); +} + +static void let_guest_run(struct sync_area *sync) +{ + atomic_store_explicit(&sync->start_flag, true, memory_order_release); +} + +static void guest_spin_until_start(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + while (!atomic_load_explicit(&sync->start_flag, memory_order_acquire)) + ; +} + +static void make_guest_exit(struct sync_area *sync) +{ + atomic_store_explicit(&sync->exit_flag, true, memory_order_release); +} + +static bool _guest_should_exit(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + return atomic_load_explicit(&sync->exit_flag, memory_order_acquire); +} + +#define guest_should_exit() unlikely(_guest_should_exit()) + +/* + * noinline so we can easily see how much time the host spends waiting + * for the guest. + * For the same reason use alarm() instead of polling clock_gettime() + * to implement a wait timeout. + */ +static noinline void host_perform_sync(struct sync_area *sync) +{ + alarm(2); + + atomic_store_explicit(&sync->sync_flag, true, memory_order_release); + while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire)) + ; + + alarm(0); +} + +static bool guest_perform_sync(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + bool expected; + + do { + if (guest_should_exit()) + return false; + + expected = true; + } while (!atomic_compare_exchange_weak_explicit(&sync->sync_flag, + &expected, false, + memory_order_acq_rel, + memory_order_relaxed)); + + return true; +} + +static void guest_code_test_memslot_move(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + uintptr_t base = (typeof(base))READ_ONCE(sync->move_area_ptr); + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (!guest_should_exit()) { + uintptr_t ptr; + + for (ptr = base; ptr < base + MEM_TEST_MOVE_SIZE; + ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + /* + * No host sync here since the MMIO exits are so expensive + * that the host would spend most of its time waiting for + * the guest and so instead of measuring memslot move + * performance we would measure the performance and + * likelihood of MMIO exits + */ + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_map(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + MEM_TEST_MAP_SIZE / 2; + ptr < MEM_TEST_GPA + MEM_TEST_MAP_SIZE; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_unmap(void) +{ + struct sync_area *sync = (typeof(sync))MEM_SYNC_GPA; + + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr = MEM_TEST_GPA; + + /* + * We can afford to access (map) just a small number of pages + * per host sync as otherwise the host will spend + * a significant amount of its time waiting for the guest + * (instead of doing unmap operations), so this will + * effectively turn this test into a map performance test. + * + * Just access a single page to be on the safe side. + */ + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + ptr += MEM_TEST_UNMAP_SIZE / 2; + *(uint64_t *)ptr = MEM_TEST_VAL_2; + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static void guest_code_test_memslot_rw(void) +{ + GUEST_SYNC(0); + + guest_spin_until_start(); + + while (1) { + uintptr_t ptr; + + for (ptr = MEM_TEST_GPA; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) + *(uint64_t *)ptr = MEM_TEST_VAL_1; + + if (!guest_perform_sync()) + break; + + for (ptr = MEM_TEST_GPA + 4096 / 2; + ptr < MEM_TEST_GPA + MEM_TEST_SIZE; ptr += 4096) { + uint64_t val = *(uint64_t *)ptr; + + GUEST_ASSERT_1(val == MEM_TEST_VAL_2, val); + *(uint64_t *)ptr = 0; + } + + if (!guest_perform_sync()) + break; + } + + GUEST_DONE(); +} + +static bool test_memslot_move_prepare(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots, bool isactive) +{ + uint64_t movesrcgpa, movetestgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + + if (isactive) { + uint64_t lastpages; + + vm_gpa2hva(data, movesrcgpa, &lastpages); + if (lastpages < MEM_TEST_MOVE_SIZE_PAGES / 2) { + *maxslots = 0; + return false; + } + } + + movetestgpa = movesrcgpa - (MEM_TEST_MOVE_SIZE / (isactive ? 2 : 1)); + sync->move_area_ptr = (void *)movetestgpa; + + if (isactive) { + data->mmio_ok = true; + data->mmio_gpa_min = movesrcgpa; + data->mmio_gpa_max = movesrcgpa + MEM_TEST_MOVE_SIZE / 2 - 1; + } + + return true; +} + +static bool test_memslot_move_prepare_active(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, true); +} + +static bool test_memslot_move_prepare_inactive(struct vm_data *data, + struct sync_area *sync, + uint64_t *maxslots) +{ + return test_memslot_move_prepare(data, sync, maxslots, false); +} + +static void test_memslot_move_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t movesrcgpa; + + movesrcgpa = vm_slot2gpa(data, data->nslots - 1); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, + MEM_TEST_MOVE_GPA_DEST); + vm_mem_region_move(data->vm, data->nslots - 1 + 1, movesrcgpa); +} + +static void test_memslot_do_unmap(struct vm_data *data, + uint64_t offsp, uint64_t count) +{ + uint64_t gpa, ctr; + + for (gpa = MEM_TEST_GPA + offsp * 4096, ctr = 0; ctr < count; ) { + uint64_t npages; + void *hva; + int ret; + + hva = vm_gpa2hva(data, gpa, &npages); + TEST_ASSERT(npages, "Empty memory slot at gptr 0x%"PRIx64, gpa); + npages = min(npages, count - ctr); + ret = madvise(hva, npages * 4096, MADV_DONTNEED); + TEST_ASSERT(!ret, + "madvise(%p, MADV_DONTNEED) on VM memory should not fail for gptr 0x%"PRIx64, + hva, gpa); + ctr += npages; + gpa += npages * 4096; + } + TEST_ASSERT(ctr == count, + "madvise(MADV_DONTNEED) should exactly cover all of the requested area"); +} + +static void test_memslot_map_unmap_check(struct vm_data *data, + uint64_t offsp, uint64_t valexp) +{ + uint64_t gpa; + uint64_t *val; + + if (!map_unmap_verify) + return; + + gpa = MEM_TEST_GPA + offsp * 4096; + val = (typeof(val))vm_gpa2hva(data, gpa, NULL); + TEST_ASSERT(*val == valexp, + "Guest written values should read back correctly before unmap (%"PRIu64" vs %"PRIu64" @ %"PRIx64")", + *val, valexp, gpa); + *val = 0; +} + +static void test_memslot_map_loop(struct vm_data *data, struct sync_area *sync) +{ + /* + * Unmap the second half of the test area while guest writes to (maps) + * the first half. + */ + test_memslot_do_unmap(data, MEM_TEST_MAP_SIZE_PAGES / 2, + MEM_TEST_MAP_SIZE_PAGES / 2); + + /* + * Wait for the guest to finish writing the first half of the test + * area, verify the written value on the first and the last page of + * this area and then unmap it. + * Meanwhile, the guest is writing to (mapping) the second half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + test_memslot_map_unmap_check(data, + MEM_TEST_MAP_SIZE_PAGES / 2 - 1, + MEM_TEST_VAL_1); + test_memslot_do_unmap(data, 0, MEM_TEST_MAP_SIZE_PAGES / 2); + + + /* + * Wait for the guest to finish writing the second half of the test + * area and verify the written value on the first and the last page + * of this area. + * The area will be unmapped at the beginning of the next loop + * iteration. + * Meanwhile, the guest is writing to (mapping) the first half of + * the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES / 2, + MEM_TEST_VAL_2); + test_memslot_map_unmap_check(data, MEM_TEST_MAP_SIZE_PAGES - 1, + MEM_TEST_VAL_2); +} + +static void test_memslot_unmap_loop_common(struct vm_data *data, + struct sync_area *sync, + uint64_t chunk) +{ + uint64_t ctr; + + /* + * Wait for the guest to finish mapping page(s) in the first half + * of the test area, verify the written value and then perform unmap + * of this area. + * Meanwhile, the guest is writing to (mapping) page(s) in the second + * half of the test area. + */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, 0, MEM_TEST_VAL_1); + for (ctr = 0; ctr < MEM_TEST_UNMAP_SIZE_PAGES / 2; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); + + /* Likewise, but for the opposite host / guest areas */ + host_perform_sync(sync); + test_memslot_map_unmap_check(data, MEM_TEST_UNMAP_SIZE_PAGES / 2, + MEM_TEST_VAL_2); + for (ctr = MEM_TEST_UNMAP_SIZE_PAGES / 2; + ctr < MEM_TEST_UNMAP_SIZE_PAGES; ctr += chunk) + test_memslot_do_unmap(data, ctr, chunk); +} + +static void test_memslot_unmap_loop(struct vm_data *data, + struct sync_area *sync) +{ + test_memslot_unmap_loop_common(data, sync, 1); +} + +static void test_memslot_unmap_loop_chunked(struct vm_data *data, + struct sync_area *sync) +{ + test_memslot_unmap_loop_common(data, sync, MEM_TEST_UNMAP_CHUNK_PAGES); +} + +static void test_memslot_rw_loop(struct vm_data *data, struct sync_area *sync) +{ + uint64_t gptr; + + for (gptr = MEM_TEST_GPA + 4096 / 2; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) + *(uint64_t *)vm_gpa2hva(data, gptr, NULL) = MEM_TEST_VAL_2; + + host_perform_sync(sync); + + for (gptr = MEM_TEST_GPA; + gptr < MEM_TEST_GPA + MEM_TEST_SIZE; gptr += 4096) { + uint64_t *vptr = (typeof(vptr))vm_gpa2hva(data, gptr, NULL); + uint64_t val = *vptr; + + TEST_ASSERT(val == MEM_TEST_VAL_1, + "Guest written values should read back correctly (is %"PRIu64" @ %"PRIx64")", + val, gptr); + *vptr = 0; + } + + host_perform_sync(sync); +} + +struct test_data { + const char *name; + uint64_t mem_size; + void (*guest_code)(void); + bool (*prepare)(struct vm_data *data, struct sync_area *sync, + uint64_t *maxslots); + void (*loop)(struct vm_data *data, struct sync_area *sync); +}; + +static bool test_execute(int nslots, uint64_t *maxslots, + unsigned int maxtime, + const struct test_data *tdata, + uint64_t *nloops, + struct timespec *slot_runtime, + struct timespec *guest_runtime) +{ + uint64_t mem_size = tdata->mem_size ? : MEM_SIZE_PAGES; + struct vm_data *data; + struct sync_area *sync; + struct timespec tstart; + bool ret = true; + + data = alloc_vm(); + if (!prepare_vm(data, nslots, maxslots, tdata->guest_code, + mem_size, slot_runtime)) { + ret = false; + goto exit_free; + } + + sync = (typeof(sync))vm_gpa2hva(data, MEM_SYNC_GPA, NULL); + + if (tdata->prepare && + !tdata->prepare(data, sync, maxslots)) { + ret = false; + goto exit_free; + } + + launch_vm(data); + + clock_gettime(CLOCK_MONOTONIC, &tstart); + let_guest_run(sync); + + while (1) { + *guest_runtime = timespec_elapsed(tstart); + if (guest_runtime->tv_sec >= maxtime) + break; + + tdata->loop(data, sync); + + (*nloops)++; + } + + make_guest_exit(sync); + wait_guest_exit(data); + +exit_free: + free_vm(data); + + return ret; +} + +static const struct test_data tests[] = { + { + .name = "map", + .mem_size = MEM_SIZE_MAP_PAGES, + .guest_code = guest_code_test_memslot_map, + .loop = test_memslot_map_loop, + }, + { + .name = "unmap", + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop, + }, + { + .name = "unmap chunked", + .mem_size = MEM_TEST_UNMAP_SIZE_PAGES + 1, + .guest_code = guest_code_test_memslot_unmap, + .loop = test_memslot_unmap_loop_chunked, + }, + { + .name = "move active area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_active, + .loop = test_memslot_move_loop, + }, + { + .name = "move inactive area", + .guest_code = guest_code_test_memslot_move, + .prepare = test_memslot_move_prepare_inactive, + .loop = test_memslot_move_loop, + }, + { + .name = "RW", + .guest_code = guest_code_test_memslot_rw, + .loop = test_memslot_rw_loop + }, +}; + +#define NTESTS ARRAY_SIZE(tests) + +struct test_args { + int tfirst; + int tlast; + int nslots; + int seconds; + int runs; +}; + +static void help(char *name, struct test_args *targs) +{ + int ctr; + + pr_info("usage: %s [-h] [-v] [-d] [-s slots] [-f first_test] [-e last_test] [-l test_length] [-r run_count]\n", + name); + pr_info(" -h: print this help screen.\n"); + pr_info(" -v: enable verbose mode (not for benchmarking).\n"); + pr_info(" -d: enable extra debug checks.\n"); + pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n", + targs->nslots); + pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n", + targs->tfirst, NTESTS - 1); + pr_info(" -e: specify the last test to run (currently: %i; max %zu)\n", + targs->tlast, NTESTS - 1); + pr_info(" -l: specify the test length in seconds (currently: %i)\n", + targs->seconds); + pr_info(" -r: specify the number of runs per test (currently: %i)\n", + targs->runs); + + pr_info("\nAvailable tests:\n"); + for (ctr = 0; ctr < NTESTS; ctr++) + pr_info("%d: %s\n", ctr, tests[ctr].name); +} + +static bool parse_args(int argc, char *argv[], + struct test_args *targs) +{ + int opt; + + while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) { + switch (opt) { + case 'h': + default: + help(argv[0], targs); + return false; + case 'v': + verbose = true; + break; + case 'd': + map_unmap_verify = true; + break; + case 's': + targs->nslots = atoi(optarg); + if (targs->nslots <= 0 && targs->nslots != -1) { + pr_info("Slot count cap has to be positive or -1 for no cap\n"); + return false; + } + break; + case 'f': + targs->tfirst = atoi(optarg); + if (targs->tfirst < 0) { + pr_info("First test to run has to be non-negative\n"); + return false; + } + break; + case 'e': + targs->tlast = atoi(optarg); + if (targs->tlast < 0 || targs->tlast >= NTESTS) { + pr_info("Last test to run has to be non-negative and less than %zu\n", + NTESTS); + return false; + } + break; + case 'l': + targs->seconds = atoi(optarg); + if (targs->seconds < 0) { + pr_info("Test length in seconds has to be non-negative\n"); + return false; + } + break; + case 'r': + targs->runs = atoi(optarg); + if (targs->runs <= 0) { + pr_info("Runs per test has to be positive\n"); + return false; + } + break; + } + } + + if (optind < argc) { + help(argv[0], targs); + return false; + } + + if (targs->tfirst > targs->tlast) { + pr_info("First test to run cannot be greater than the last test to run\n"); + return false; + } + + return true; +} + +struct test_result { + struct timespec slot_runtime, guest_runtime, iter_runtime; + int64_t slottimens, runtimens; + uint64_t nloops; +}; + +static bool test_loop(const struct test_data *data, + const struct test_args *targs, + struct test_result *rbestslottime, + struct test_result *rbestruntime) +{ + uint64_t maxslots; + struct test_result result; + + result.nloops = 0; + if (!test_execute(targs->nslots, &maxslots, targs->seconds, data, + &result.nloops, + &result.slot_runtime, &result.guest_runtime)) { + if (maxslots) + pr_info("Memslot count too high for this test, decrease the cap (max is %"PRIu64")\n", + maxslots); + else + pr_info("Memslot count may be too high for this test, try adjusting the cap\n"); + + return false; + } + + pr_info("Test took %ld.%.9lds for slot setup + %ld.%.9lds all iterations\n", + result.slot_runtime.tv_sec, result.slot_runtime.tv_nsec, + result.guest_runtime.tv_sec, result.guest_runtime.tv_nsec); + if (!result.nloops) { + pr_info("No full loops done - too short test time or system too loaded?\n"); + return true; + } + + result.iter_runtime = timespec_div(result.guest_runtime, + result.nloops); + pr_info("Done %"PRIu64" iterations, avg %ld.%.9lds each\n", + result.nloops, + result.iter_runtime.tv_sec, + result.iter_runtime.tv_nsec); + result.slottimens = timespec_to_ns(result.slot_runtime); + result.runtimens = timespec_to_ns(result.iter_runtime); + + /* + * Only rank the slot setup time for tests using the whole test memory + * area so they are comparable + */ + if (!data->mem_size && + (!rbestslottime->slottimens || + result.slottimens < rbestslottime->slottimens)) + *rbestslottime = result; + if (!rbestruntime->runtimens || + result.runtimens < rbestruntime->runtimens) + *rbestruntime = result; + + return true; +} + +int main(int argc, char *argv[]) +{ + struct test_args targs = { + .tfirst = 0, + .tlast = NTESTS - 1, + .nslots = -1, + .seconds = 5, + .runs = 1, + }; + struct test_result rbestslottime; + int tctr; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + if (!parse_args(argc, argv, &targs)) + return -1; + + rbestslottime.slottimens = 0; + for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) { + const struct test_data *data = &tests[tctr]; + unsigned int runctr; + struct test_result rbestruntime; + + if (tctr > targs.tfirst) + pr_info("\n"); + + pr_info("Testing %s performance with %i runs, %d seconds each\n", + data->name, targs.runs, targs.seconds); + + rbestruntime.runtimens = 0; + for (runctr = 0; runctr < targs.runs; runctr++) + if (!test_loop(data, &targs, + &rbestslottime, &rbestruntime)) + break; + + if (rbestruntime.runtimens) + pr_info("Best runtime result was %ld.%.9lds per iteration (with %"PRIu64" iterations)\n", + rbestruntime.iter_runtime.tv_sec, + rbestruntime.iter_runtime.tv_nsec, + rbestruntime.nloops); + } + + if (rbestslottime.slottimens) + pr_info("Best slot setup time for the whole test area was %ld.%.9lds\n", + rbestslottime.slot_runtime.tv_sec, + rbestslottime.slot_runtime.tv_nsec); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c index 9b78e8889638..8c77537af5a1 100644 --- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c @@ -19,7 +19,12 @@ struct { u32 function; u32 index; } mangled_cpuids[] = { + /* + * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR, + * which are not controlled for by this test. + */ {.function = 0xd, .index = 0}, + {.function = 0xd, .index = 1}, }; static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c index cb953df4d7d0..8aed0db1331d 100644 --- a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -37,9 +37,7 @@ static void test_get_msr_index(void) int old_res, res, kvm_fd, r; struct kvm_msr_list *list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); old_res = kvm_num_index_msrs(kvm_fd, 0); TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); @@ -101,9 +99,7 @@ static void test_get_msr_feature(void) int res, old_res, i, kvm_fd; struct kvm_msr_list *feature_list; - kvm_fd = open(KVM_DEV_PATH, O_RDONLY); - if (kvm_fd < 0) - exit(KSFT_SKIP); + kvm_fd = open_kvm_dev_path_or_exit(); old_res = kvm_num_feature_msrs(kvm_fd, 0); TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6b4feb92dc79..6a6bc7af0e28 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -307,6 +307,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { return kvm_make_all_cpus_request_except(kvm, req, NULL); } +EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) @@ -2929,6 +2930,8 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) goto out; if (signal_pending(current)) goto out; + if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) + goto out; ret = 0; out: @@ -2973,8 +2976,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) goto out; } poll_end = cur = ktime_get(); - } while (single_task_running() && !need_resched() && - ktime_before(cur, stop)); + } while (kvm_vcpu_can_poll(cur, stop)); } prepare_to_rcuwait(&vcpu->wait); diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index c9bb3957f58a..28fda42e471b 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -40,21 +40,17 @@ static int __connect(struct irq_bypass_producer *prod, if (prod->add_consumer) ret = prod->add_consumer(prod, cons); - if (ret) - goto err_add_consumer; - - ret = cons->add_producer(cons, prod); - if (ret) - goto err_add_producer; + if (!ret) { + ret = cons->add_producer(cons, prod); + if (ret && prod->del_consumer) + prod->del_consumer(prod, cons); + } if (cons->start) cons->start(cons); if (prod->start) prod->start(prod); -err_add_producer: - if (prod->del_consumer) - prod->del_consumer(prod, cons); -err_add_consumer: + return ret; } |