author		Linus Torvalds	2022-08-04 14:59:54 -0700
committer	Linus Torvalds	2022-08-04 14:59:54 -0700
commit		7c5c3a6177fa9646884114fc7f2e970b0bc50dc9 (patch)
tree		956857522574ae7cb07d2227dc16e53d7e9e00e7 /arch
parent		f0a892f599c46af673e47418c47c15e69a7b67f4 (diff)
parent		281106f938d3daaea6f8b6723a8217a2a1ef6936 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"Quite a large pull request due to a selftest API overhaul and some
patches that had come in too late for 5.19.
ARM:
- Unwinder implementations for both nVHE modes (classic and
protected), complete with an overflow stack (a toy model of the
shared unwind step follows the shortlog below)
- Rework of the sysreg access from userspace, with a complete rewrite
of the vgic-v3 view to align with the rest of the infrastructure
- Disaggregation of the vcpu flags into separate sets to better track
their use model (a toy model of the flag accessors follows the
diffstat below)
- A fix for the GICv2-on-v3 selftest
- A small set of cosmetic fixes
RISC-V:
- Track ISA extensions used by Guest using bitmap
- Added system instruction emulation framework
- Added CSR emulation framework
- Added gfp_custom flag in struct kvm_mmu_memory_cache
- Added G-stage ioremap() and iounmap() functions
- Added support for Svpbmt inside Guest
s390:
- add an interface to provide a hypervisor dump for secure guests
- improve selftests to use TAP interface
- enable interpretive execution of zPCI instructions (for PCI
passthrough)
- First part of deferred teardown
- CPU Topology
- PV attestation
- Minor fixes
x86:
- Permit guests to ignore single-bit ECC errors
- Intel IPI virtualization
- Allow getting/setting pending triple fault with
KVM_GET/SET_VCPU_EVENTS
- PEBS virtualization
- Simplify PMU emulation by just using PERF_TYPE_RAW events
- More accurate event reinjection on SVM (avoid retrying
instructions)
- Allow getting/setting the state of the speaker port data bit
- Refuse starting the kvm-intel module if VM-Entry/VM-Exit controls
are inconsistent
- "Notify" VM exit (detect microarchitectural hangs) for Intel
- Use try_cmpxchg64 instead of cmpxchg64
- Ignore benign host accesses to PMU MSRs when PMU is disabled
- Allow disabling KVM's "MONITOR/MWAIT are NOPs!" behavior
- Allow NX huge page mitigation to be disabled on a per-vm basis
- Port eager page splitting to shadow MMU as well
- Enable CMCI capability by default and handle injected UCNA errors
- Expose pid of vcpu threads in debugfs
- x2AVIC support for AMD
- cleanup PIO emulation
- Fixes for LLDT/LTR emulation
- Don't require refcounted "struct page" to create huge SPTEs
- Miscellaneous cleanups:
- MCE MSR emulation
- Use separate namespaces for guest PTEs and shadow PTEs bitmasks
- PIO emulation
- Reorganize rmap API, mostly around rmap destruction
- Do not work around very old KVM bugs for L0 that runs with nesting enabled
- new selftests API for CPUID
Generic:
- Fix races in gfn->pfn cache refresh; do not pin pages tracked by
the cache
- new selftests API using struct kvm_vcpu instead of a (vm, id)
tuple"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (606 commits)
selftests: kvm: set rax before vmcall
selftests: KVM: Add exponent check for boolean stats
selftests: KVM: Provide descriptive assertions in kvm_binary_stats_test
selftests: KVM: Check stat name before other fields
KVM: x86/mmu: remove unused variable
RISC-V: KVM: Add support for Svpbmt inside Guest/VM
RISC-V: KVM: Use PAGE_KERNEL_IO in kvm_riscv_gstage_ioremap()
RISC-V: KVM: Add G-stage ioremap() and iounmap() functions
KVM: Add gfp_custom flag in struct kvm_mmu_memory_cache
RISC-V: KVM: Add extensible CSR emulation framework
RISC-V: KVM: Add extensible system instruction emulation framework
RISC-V: KVM: Factor-out instruction emulation into separate sources
RISC-V: KVM: move preempt_disable() call in kvm_arch_vcpu_ioctl_run
RISC-V: KVM: Make kvm_riscv_guest_timer_init a void function
RISC-V: KVM: Fix variable spelling mistake
RISC-V: KVM: Improve ISA extension by using a bitmap
KVM, x86/mmu: Fix the comment around kvm_tdp_mmu_zap_leafs()
KVM: SVM: Dump Virtual Machine Save Area (VMSA) to klog
KVM: x86/mmu: Treat NX as a valid SPTE bit for NPT
KVM: x86: Do not block APIC write for non ICR registers
...
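The unwinder rework in the diff below moves the core frame-walking step into
arch/arm64/include/asm/stacktrace/common.h as unwind_next_common(),
parameterised by an on_accessible_stack callback and an optional
frame-pointer translation callback, so the kernel-proper and nVHE hypervisor
unwinders can share it. The standalone C sketch below models just that
callback shape; the fake stack, struct layout and helper names are invented
for illustration and are not the kernel API.

/*
 * Userspace model of the shared unwind step: walk a chain of {fp, lr}
 * frame records, asking a callback whether the candidate fp lies on an
 * accessible stack and (optionally) translating it before dereferencing,
 * similar in spirit to unwind_next_common(). Illustrative names only.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct frame_record { uint64_t fp, lr; };
struct unwind_state { uint64_t fp, pc; };

typedef bool (*accessible_fn)(uint64_t fp);
typedef bool (*translate_fp_fn)(uint64_t *fp);

static int unwind_next(struct unwind_state *state,
		       accessible_fn accessible, translate_fp_fn translate_fp)
{
	uint64_t fp = state->fp;
	const struct frame_record *rec;

	if (fp & 0x7)				/* records are 8-byte aligned */
		return -1;
	if (!accessible(fp))			/* e.g. on_accessible_stack() */
		return -1;
	if (translate_fp && !translate_fp(&fp))	/* e.g. hyp VA -> kernel VA */
		return -1;

	rec = (const struct frame_record *)(uintptr_t)fp;
	state->fp = rec->fp;
	state->pc = rec->lr;
	return 0;
}

/* Three fake frame records standing in for a stack. */
static struct frame_record stack[3];

static bool on_fake_stack(uint64_t fp)
{
	return fp >= (uintptr_t)&stack[0] && fp <= (uintptr_t)&stack[2];
}

int main(void)
{
	struct unwind_state state;

	stack[0] = (struct frame_record){ (uintptr_t)&stack[1], 0x1000 };
	stack[1] = (struct frame_record){ (uintptr_t)&stack[2], 0x2000 };
	stack[2] = (struct frame_record){ 0, 0x3000 };

	state.fp = (uintptr_t)&stack[0];
	while (!unwind_next(&state, on_fake_stack, NULL) && state.pc)
		printf("pc: %#llx\n", (unsigned long long)state.pc);
	return 0;
}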
Diffstat (limited to 'arch')
153 files changed, 8151 insertions, 3221 deletions
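The vcpu flag disaggregation visible in arch/arm64/include/asm/kvm_host.h
below encodes every flag as a (flag-set, value, mask) triplet, so
vcpu_{get,set,clear}_flag() can select the right cflags/iflags/sflags field
and mask multi-bit flags such as EXCEPT_MASK. The standalone sketch below
mimics that triplet trick with invented struct and flag names; it is an
illustration of the macro pattern, not the kernel's definitions.

/*
 * Standalone model of the "flag triplet" pattern: each flag name expands
 * to (flag-set, value, mask), so a single accessor macro can pick the
 * right field and handle multi-bit flags. Names are made up for the
 * example; see arch/arm64/include/asm/kvm_host.h for the real thing.
 */
#include <stdio.h>

struct vcpu_arch {
	unsigned char cflags;	/* configuration, fixed before the vcpu runs */
	unsigned char iflags;	/* input to the hypervisor, cleared after use */
	unsigned char sflags;	/* kernel-side bookkeeping only */
};

#define __single_flag(_set, _f)		_set, (_f), (_f)

#define GUEST_HAS_SVE			__single_flag(cflags, 1U << 0)
#define INCREMENT_PC			__single_flag(iflags, 1U << 1)

#define __flag_get(v, set, f, m)	((v)->set & (m))
#define __flag_set(v, set, f, m)	((v)->set = ((v)->set & ~(m)) | (f))
#define __flag_clear(v, set, f, m)	((v)->set &= ~(m))

#define flag_get(v, ...)		__flag_get((v), __VA_ARGS__)
#define flag_set(v, ...)		__flag_set((v), __VA_ARGS__)
#define flag_clear(v, ...)		__flag_clear((v), __VA_ARGS__)

int main(void)
{
	struct vcpu_arch arch = { 0 };

	flag_set(&arch, GUEST_HAS_SVE);		/* lands in cflags */
	flag_set(&arch, INCREMENT_PC);		/* lands in iflags */
	printf("cflags=%#x iflags=%#x sve=%d\n",
	       arch.cflags, arch.iflags, !!flag_get(&arch, GUEST_HAS_SVE));

	flag_clear(&arch, INCREMENT_PC);
	printf("iflags after clear=%#x\n", arch.iflags);
	return 0;
}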
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2e277f2ed671..53035763e48e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -176,6 +176,22 @@ struct kvm_nvhe_init_params { unsigned long vtcr; }; +/* + * Used by the host in EL1 to dump the nVHE hypervisor backtrace on + * hyp_panic() in non-protected mode. + * + * @stack_base: hyp VA of the hyp_stack base. + * @overflow_stack_base: hyp VA of the hyp_overflow_stack base. + * @fp: hyp FP where the backtrace begins. + * @pc: hyp PC where the backtrace begins. + */ +struct kvm_nvhe_stacktrace_info { + unsigned long stack_base; + unsigned long overflow_stack_base; + unsigned long fp; + unsigned long pc; +}; + /* Translate a kernel address @ptr into its equivalent linear mapping */ #define kvm_ksym_ref(ptr) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0e66edd3aff2..9bdba47f7e14 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -473,9 +473,18 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { - vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; + WARN_ON(vcpu_get_flag(vcpu, PENDING_EXCEPTION)); + vcpu_set_flag(vcpu, INCREMENT_PC); } +#define kvm_pend_exception(v, e) \ + do { \ + WARN_ON(vcpu_get_flag((v), INCREMENT_PC)); \ + vcpu_set_flag((v), PENDING_EXCEPTION); \ + vcpu_set_flag((v), e); \ + } while (0) + + static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) { return test_bit(feature, vcpu->arch.features); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index de32152cea04..f38ef299f13b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -325,8 +325,30 @@ struct kvm_vcpu_arch { /* Exception Information */ struct kvm_vcpu_fault_info fault; - /* Miscellaneous vcpu state flags */ - u64 flags; + /* Ownership of the FP regs */ + enum { + FP_STATE_FREE, + FP_STATE_HOST_OWNED, + FP_STATE_GUEST_OWNED, + } fp_state; + + /* Configuration flags, set once and for all before the vcpu can run */ + u8 cflags; + + /* Input flags to the hypervisor code, potentially cleared after use */ + u8 iflags; + + /* State flags for kernel bookkeeping, unused by the hypervisor code */ + u8 sflags; + + /* + * Don't run the guest (internal implementation need). + * + * Contrary to the flags above, this is set/cleared outside of + * a vcpu context, and thus cannot be mixed with the flags + * themselves (or the flag accesses need to be made atomic). + */ + bool pause; /* * We maintain more than a single set of debug registers to support @@ -376,9 +398,6 @@ struct kvm_vcpu_arch { /* vcpu power state */ struct kvm_mp_state mp_state; - /* Don't run the guest (internal implementation need) */ - bool pause; - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; @@ -392,10 +411,6 @@ struct kvm_vcpu_arch { /* Additional reset state */ struct vcpu_reset_state reset_state; - /* True when deferrable sysregs are loaded on the physical CPU, - * see kvm_vcpu_load_sysregs_vhe and kvm_vcpu_put_sysregs_vhe. 
*/ - bool sysregs_loaded_on_cpu; - /* Guest PV state */ struct { u64 last_steal; @@ -403,6 +418,124 @@ struct kvm_vcpu_arch { } steal; }; +/* + * Each 'flag' is composed of a comma-separated triplet: + * + * - the flag-set it belongs to in the vcpu->arch structure + * - the value for that flag + * - the mask for that flag + * + * __vcpu_single_flag() builds such a triplet for a single-bit flag. + * unpack_vcpu_flag() extract the flag value from the triplet for + * direct use outside of the flag accessors. + */ +#define __vcpu_single_flag(_set, _f) _set, (_f), (_f) + +#define __unpack_flag(_set, _f, _m) _f +#define unpack_vcpu_flag(...) __unpack_flag(__VA_ARGS__) + +#define __build_check_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *_fset; \ + \ + /* Check that the flags fit in the mask */ \ + BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m))); \ + /* Check that the flags fit in the type */ \ + BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m)); \ + } while (0) + +#define __vcpu_get_flag(v, flagset, f, m) \ + ({ \ + __build_check_flag(v, flagset, f, m); \ + \ + v->arch.flagset & (m); \ + }) + +#define __vcpu_set_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *fset; \ + \ + __build_check_flag(v, flagset, f, m); \ + \ + fset = &v->arch.flagset; \ + if (HWEIGHT(m) > 1) \ + *fset &= ~(m); \ + *fset |= (f); \ + } while (0) + +#define __vcpu_clear_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *fset; \ + \ + __build_check_flag(v, flagset, f, m); \ + \ + fset = &v->arch.flagset; \ + *fset &= ~(m); \ + } while (0) + +#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__) +#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) +#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__) + +/* SVE exposed to guest */ +#define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) +/* SVE config completed */ +#define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) +/* PTRAUTH exposed to guest */ +#define GUEST_HAS_PTRAUTH __vcpu_single_flag(cflags, BIT(2)) + +/* Exception pending */ +#define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) +/* + * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't + * be set together with an exception... 
+ */ +#define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1)) +/* Target EL/MODE (not a single flag, but let's abuse the macro) */ +#define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1)) + +/* Helpers to encode exceptions with minimum fuss */ +#define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK) +#define __EXCEPT_SHIFT __builtin_ctzl(__EXCEPT_MASK_VAL) +#define __vcpu_except_flags(_f) iflags, (_f << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL + +/* + * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following + * values: + * + * For AArch32 EL1: + */ +#define EXCEPT_AA32_UND __vcpu_except_flags(0) +#define EXCEPT_AA32_IABT __vcpu_except_flags(1) +#define EXCEPT_AA32_DABT __vcpu_except_flags(2) +/* For AArch64: */ +#define EXCEPT_AA64_EL1_SYNC __vcpu_except_flags(0) +#define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1) +#define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2) +#define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3) +/* For AArch64 with NV (one day): */ +#define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4) +#define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) +#define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) +#define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) +/* Guest debug is live */ +#define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) +/* Save SPE context if active */ +#define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) +/* Save TRBE context if active */ +#define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) + +/* SVE enabled for host EL0 */ +#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) +/* SME enabled for EL0 */ +#define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) +/* Physical CPU not in supported_cpus */ +#define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) +/* WFIT instruction trapped */ +#define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) +/* vcpu system registers loaded on physical CPU */ +#define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) + /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl)) @@ -423,70 +556,31 @@ struct kvm_vcpu_arch { __size_ret; \ }) -/* vcpu_arch flags field values: */ -#define KVM_ARM64_DEBUG_DIRTY (1 << 0) -#define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ -#define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ -#define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ -#define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ -#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ -#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ -#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */ -/* - * Overlaps with KVM_ARM64_EXCEPT_MASK on purpose so that it can't be - * set together with an exception... 
- */ -#define KVM_ARM64_INCREMENT_PC (1 << 9) /* Increment PC */ -#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */ -/* - * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can - * take the following values: - * - * For AArch32 EL1: - */ -#define KVM_ARM64_EXCEPT_AA32_UND (0 << 9) -#define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9) -#define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9) -/* For AArch64: */ -#define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_IRQ (1 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_FIQ (2 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_SERR (3 << 9) -#define KVM_ARM64_EXCEPT_AA64_EL1 (0 << 11) -#define KVM_ARM64_EXCEPT_AA64_EL2 (1 << 11) - -#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */ -#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ -#define KVM_ARM64_FP_FOREIGN_FPSTATE (1 << 14) -#define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ -#define KVM_ARM64_HOST_SME_ENABLED (1 << 16) /* SME enabled for EL0 */ -#define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ - #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ KVM_GUESTDBG_SINGLESTEP) #define vcpu_has_sve(vcpu) (system_supports_sve() && \ - ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) + vcpu_get_flag(vcpu, GUEST_HAS_SVE)) #ifdef CONFIG_ARM64_PTR_AUTH #define vcpu_has_ptrauth(vcpu) \ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \ - (vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH) + vcpu_get_flag(vcpu, GUEST_HAS_PTRAUTH)) #else #define vcpu_has_ptrauth(vcpu) false #endif #define vcpu_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags & KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_set_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags |= KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_clear_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags &= ~KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_gp_regs(v) (&(v)->arch.ctxt.regs) @@ -620,8 +714,6 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); @@ -831,8 +923,7 @@ void kvm_init_protected_traps(struct kvm_vcpu *vcpu); int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); -#define kvm_arm_vcpu_sve_finalized(vcpu) \ - ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED) +#define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED) #define kvm_has_mte(kvm) \ (system_supports_mte() && \ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 227d256cd4b9..4e4c171f070b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -114,6 +114,14 @@ #define OVERFLOW_STACK_SIZE SZ_4K /* + * With the minimum frame size of [x29, x30], exactly half the combined + * sizes of the hyp and overflow stacks is the maximum size needed to + * save the unwinded stacktrace; plus an 
additional entry to delimit the + * end. + */ +#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long)) + +/* * Alignment of kernel segments (e.g. .text, .data). * * 4 KB granule: 16 level 3 entries, with contiguous bit diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index aec9315bf156..6ebdcdff77f5 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -8,52 +8,20 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> -#include <linux/types.h> #include <linux/llist.h> #include <asm/memory.h> +#include <asm/pointer_auth.h> #include <asm/ptrace.h> #include <asm/sdei.h> -enum stack_type { - STACK_TYPE_UNKNOWN, - STACK_TYPE_TASK, - STACK_TYPE_IRQ, - STACK_TYPE_OVERFLOW, - STACK_TYPE_SDEI_NORMAL, - STACK_TYPE_SDEI_CRITICAL, - __NR_STACK_TYPES -}; - -struct stack_info { - unsigned long low; - unsigned long high; - enum stack_type type; -}; +#include <asm/stacktrace/common.h> extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_stack(unsigned long sp, unsigned long size, - unsigned long low, unsigned long high, - enum stack_type type, struct stack_info *info) -{ - if (!low) - return false; - - if (sp < low || sp + size < sp || sp + size > high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = type; - } - return true; -} - static inline bool on_irq_stack(unsigned long sp, unsigned long size, struct stack_info *info) { @@ -89,30 +57,4 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { return false; } #endif - -/* - * We can only safely access per-cpu stacks from current in a non-preemptible - * context. - */ -static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, unsigned long size, - struct stack_info *info) -{ - if (info) - info->type = STACK_TYPE_UNKNOWN; - - if (on_task_stack(tsk, sp, size, info)) - return true; - if (tsk != current || preemptible()) - return false; - if (on_irq_stack(sp, size, info)) - return true; - if (on_overflow_stack(sp, size, info)) - return true; - if (on_sdei_stack(sp, size, info)) - return true; - - return false; -} - #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h new file mode 100644 index 000000000000..f58eb944c46f --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common arm64 stack unwinder code. + * + * To implement a new arm64 stack unwinder: + * 1) Include this header + * + * 2) Call into unwind_next_common() from your top level unwind + * function, passing it the validation and translation callbacks + * (though the later can be NULL if no translation is required). + * + * See: arch/arm64/kernel/stacktrace.c for the reference implementation. + * + * Copyright (C) 2012 ARM Ltd. 
+ */ +#ifndef __ASM_STACKTRACE_COMMON_H +#define __ASM_STACKTRACE_COMMON_H + +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/kprobes.h> +#include <linux/types.h> + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_OVERFLOW, + STACK_TYPE_SDEI_NORMAL, + STACK_TYPE_SDEI_CRITICAL, + STACK_TYPE_HYP, + __NR_STACK_TYPES +}; + +struct stack_info { + unsigned long low; + unsigned long high; + enum stack_type type; +}; + +/* + * A snapshot of a frame record or fp/lr register values, along with some + * accounting information necessary for robust unwinding. + * + * @fp: The fp value in the frame record (or the real fp) + * @pc: The lr value in the frame record (or the real lr) + * + * @stacks_done: Stacks which have been entirely unwound, for which it is no + * longer valid to unwind to. + * + * @prev_fp: The fp that pointed to this frame record, or a synthetic value + * of 0. This is used to ensure that within a stack, each + * subsequent frame record is at an increasing address. + * @prev_type: The type of stack this frame record was on, or a synthetic + * value of STACK_TYPE_UNKNOWN. This is used to detect a + * transition from one stack to another. + * + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. + * + * @task: The task being unwound. + */ +struct unwind_state { + unsigned long fp; + unsigned long pc; + DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); + unsigned long prev_fp; + enum stack_type prev_type; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif + struct task_struct *task; +}; + +static inline bool on_stack(unsigned long sp, unsigned long size, + unsigned long low, unsigned long high, + enum stack_type type, struct stack_info *info) +{ + if (!low) + return false; + + if (sp < low || sp + size < sp || sp + size > high) + return false; + + if (info) { + info->low = low; + info->high = high; + info->type = type; + } + return true; +} + +static inline void unwind_init_common(struct unwind_state *state, + struct task_struct *task) +{ + state->task = task; +#ifdef CONFIG_KRETPROBES + state->kr_cur = NULL; +#endif + + /* + * Prime the first unwind. + * + * In unwind_next() we'll check that the FP points to a valid stack, + * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be + * treated as a transition to whichever stack that happens to be. The + * prev_fp value won't be used, but we set it to 0 such that it is + * definitely not an accessible stack address. + */ + bitmap_zero(state->stacks_done, __NR_STACK_TYPES); + state->prev_fp = 0; + state->prev_type = STACK_TYPE_UNKNOWN; +} + +/* + * stack_trace_translate_fp_fn() - Translates a non-kernel frame pointer to + * a kernel address. + * + * @fp: the frame pointer to be updated to its kernel address. + * @type: the stack type associated with frame pointer @fp + * + * Returns true and success and @fp is updated to the corresponding + * kernel virtual address; otherwise returns false. + */ +typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp, + enum stack_type type); + +/* + * on_accessible_stack_fn() - Check whether a stack range is on any + * of the possible stacks. 
+ * + * @tsk: task whose stack is being unwound + * @sp: stack address being checked + * @size: size of the stack range being checked + * @info: stack unwinding context + */ +typedef bool (*on_accessible_stack_fn)(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info); + +static inline int unwind_next_common(struct unwind_state *state, + struct stack_info *info, + on_accessible_stack_fn accessible, + stack_trace_translate_fp_fn translate_fp) +{ + unsigned long fp = state->fp, kern_fp = fp; + struct task_struct *tsk = state->task; + + if (fp & 0x7) + return -EINVAL; + + if (!accessible(tsk, fp, 16, info)) + return -EINVAL; + + if (test_bit(info->type, state->stacks_done)) + return -EINVAL; + + /* + * If fp is not from the current address space perform the necessary + * translation before dereferencing it to get the next fp. + */ + if (translate_fp && !translate_fp(&kern_fp, info->type)) + return -EINVAL; + + /* + * As stacks grow downward, any valid record on the same stack must be + * at a strictly higher address than the prior record. + * + * Stacks can nest in several valid orders, e.g. + * + * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL + * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW + * HYP -> OVERFLOW + * + * ... but the nesting itself is strict. Once we transition from one + * stack to another, it's never valid to unwind back to that first + * stack. + */ + if (info->type == state->prev_type) { + if (fp <= state->prev_fp) + return -EINVAL; + } else { + __set_bit(state->prev_type, state->stacks_done); + } + + /* + * Record this frame record's values and location. The prev_fp and + * prev_type are only meaningful to the next unwind_next() invocation. + */ + state->fp = READ_ONCE(*(unsigned long *)(kern_fp)); + state->pc = READ_ONCE(*(unsigned long *)(kern_fp + 8)); + state->prev_fp = fp; + state->prev_type = info->type; + + return 0; +} + +#endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h new file mode 100644 index 000000000000..d5527b600390 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * + * Copyright (C) 2022 Google LLC + */ +#ifndef __ASM_STACKTRACE_NVHE_H +#define __ASM_STACKTRACE_NVHE_H + +#include <asm/stacktrace/common.h> + +/* + * kvm_nvhe_unwind_init - Start an unwind from the given nVHE HYP fp and pc + * + * @state : unwind_state to initialize + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. 
+ */ +static inline void kvm_nvhe_unwind_init(struct unwind_state *state, + unsigned long fp, + unsigned long pc) +{ + unwind_init_common(state, NULL); + + state->fp = fp; + state->pc = pc; +} + +#ifndef __KVM_NVHE_HYPERVISOR__ +/* + * Conventional (non-protected) nVHE HYP stack unwinder + * + * In non-protected mode, the unwinding is done from kernel proper context + * (by the host in EL1). + */ + +DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); +DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); + +#endif /* __KVM_NVHE_HYPERVISOR__ */ +#endif /* __ASM_STACKTRACE_NVHE_H */ diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index fcaa151b81f1..ce190ee18a20 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -7,75 +7,16 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/ftrace.h> -#include <linux/kprobes.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <asm/irq.h> -#include <asm/pointer_auth.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> /* - * A snapshot of a frame record or fp/lr register values, along with some - * accounting information necessary for robust unwinding. - * - * @fp: The fp value in the frame record (or the real fp) - * @pc: The lr value in the frame record (or the real lr) - * - * @stacks_done: Stacks which have been entirely unwound, for which it is no - * longer valid to unwind to. - * - * @prev_fp: The fp that pointed to this frame record, or a synthetic value - * of 0. This is used to ensure that within a stack, each - * subsequent frame record is at an increasing address. - * @prev_type: The type of stack this frame record was on, or a synthetic - * value of STACK_TYPE_UNKNOWN. This is used to detect a - * transition from one stack to another. - * - * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance - * associated with the most recently encountered replacement lr - * value. - * - * @task: The task being unwound. - */ -struct unwind_state { - unsigned long fp; - unsigned long pc; - DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); - unsigned long prev_fp; - enum stack_type prev_type; -#ifdef CONFIG_KRETPROBES - struct llist_node *kr_cur; -#endif - struct task_struct *task; -}; - -static void unwind_init_common(struct unwind_state *state, - struct task_struct *task) -{ - state->task = task; -#ifdef CONFIG_KRETPROBES - state->kr_cur = NULL; -#endif - - /* - * Prime the first unwind. - * - * In unwind_next() we'll check that the FP points to a valid stack, - * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be - * treated as a transition to whichever stack that happens to be. The - * prev_fp value won't be used, but we set it to 0 such that it is - * definitely not an accessible stack address. - */ - bitmap_zero(state->stacks_done, __NR_STACK_TYPES); - state->prev_fp = 0; - state->prev_type = STACK_TYPE_UNKNOWN; -} - -/* * Start an unwind from a pt_regs. * * The unwind will begin at the PC within the regs. @@ -127,6 +68,31 @@ static inline void unwind_init_from_task(struct unwind_state *state, } /* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. 
+ */ +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + if (on_task_stack(tsk, sp, size, info)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp, size, info)) + return true; + if (on_overflow_stack(sp, size, info)) + return true; + if (on_sdei_stack(sp, size, info)) + return true; + + return false; +} + +/* * Unwind from one frame record (A) to the next frame record (B). * * We terminate early if the location of B indicates a malformed chain of frame @@ -138,48 +104,15 @@ static int notrace unwind_next(struct unwind_state *state) struct task_struct *tsk = state->task; unsigned long fp = state->fp; struct stack_info info; + int err; /* Final frame; nothing to unwind */ if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - if (fp & 0x7) - return -EINVAL; - - if (!on_accessible_stack(tsk, fp, 16, &info)) - return -EINVAL; - - if (test_bit(info.type, state->stacks_done)) - return -EINVAL; - - /* - * As stacks grow downward, any valid record on the same stack must be - * at a strictly higher address than the prior record. - * - * Stacks can nest in several valid orders, e.g. - * - * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL - * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW - * - * ... but the nesting itself is strict. Once we transition from one - * stack to another, it's never valid to unwind back to that first - * stack. - */ - if (info.type == state->prev_type) { - if (fp <= state->prev_fp) - return -EINVAL; - } else { - __set_bit(state->prev_type, state->stacks_done); - } - - /* - * Record this frame record's values and location. The prev_fp and - * prev_type are only meaningful to the next unwind_next() invocation. - */ - state->fp = READ_ONCE(*(unsigned long *)(fp)); - state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); - state->prev_fp = fp; - state->prev_type = info.type; + err = unwind_next_common(state, &info, on_accessible_stack, NULL); + if (err) + return err; state->pc = ptrauth_strip_insn_pac(state->pc); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8a5fbbf084df..815cc118c675 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -56,4 +56,17 @@ config NVHE_EL2_DEBUG If unsure, say N. +config PROTECTED_NVHE_STACKTRACE + bool "Protected KVM hypervisor stacktraces" + depends on NVHE_EL2_DEBUG + default n + help + Say Y here to enable pKVM hypervisor stacktraces on hyp_panic() + + If using protected nVHE mode, but cannot afford the associated + memory cost (less than 0.75 page per CPU) of pKVM stacktraces, + say N. + + If unsure, or not using protected nVHE (pKVM), say N. 
+ endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index aa127ae9f675..5e33c2d4645a 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ - guest.o debug.o reset.o sys_regs.o \ + guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ arch_timer.o trng.o vmid.o \ vgic/vgic.o vgic/vgic-init.o \ diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 3b8d062e30ea..bb24a76b4224 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -242,7 +242,7 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) { return (cpus_have_final_cap(ARM64_HAS_WFXT) && - (vcpu->arch.flags & KVM_ARM64_WFIT)); + vcpu_get_flag(vcpu, IN_WFIT)); } static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 83a7f61354d3..986cee6fbc7f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -49,7 +49,7 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); @@ -330,6 +330,12 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; + /* + * Default value for the FP state, will be overloaded at load + * time if we support FP (pretty likely) + */ + vcpu->arch.fp_state = FP_STATE_FREE; + /* Set up the timer */ kvm_timer_vcpu_init(vcpu); @@ -659,7 +665,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) preempt_enable(); kvm_vcpu_halt(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_WFIT; + vcpu_clear_flag(vcpu, IN_WFIT); kvm_clear_request(KVM_REQ_UNHALT, vcpu); preempt_disable(); @@ -1015,8 +1021,8 @@ out: * the vcpu state. Note that this relies on __kvm_adjust_pc() * being preempt-safe on VHE. */ - if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_INCREMENT_PC))) + if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) || + vcpu_get_flag(vcpu, INCREMENT_PC))) kvm_call_hyp(__kvm_adjust_pc, vcpu); vcpu_put(vcpu); @@ -1414,18 +1420,11 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { - unsigned long dev_id, type; - - dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >> - KVM_ARM_DEVICE_ID_SHIFT; - type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >> - KVM_ARM_DEVICE_TYPE_SHIFT; - - switch (dev_id) { + switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) { case KVM_ARM_DEVICE_VGIC_V2: if (!vgic_present) return -ENXIO; - return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); + return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr); default: return -ENODEV; } diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 4fd5c216c4bb..0b28d7db7c76 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -104,11 +104,11 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) * Trap debug register access when one of the following is true: * - Userspace is using the hardware to debug the guest * (KVM_GUESTDBG_USE_HW is set). 
- * - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear). + * - The guest is not using debug (DEBUG_DIRTY clear). * - The guest has enabled the OS Lock (debug exceptions are blocked). */ if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || - !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) || + !vcpu_get_flag(vcpu, DEBUG_DIRTY) || kvm_vcpu_os_lock_enabled(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; @@ -147,8 +147,8 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) * debug related registers. * * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY - * flag on vcpu->arch.flags). Since the guest must not interfere + * the guest is not actively using them (see the DEBUG_DIRTY + * flag on vcpu->arch.iflags). Since the guest must not interfere * with the hardware state when debugging the guest, we must ensure that * trapping is enabled whenever we are debugging the guest using the * debug registers. @@ -205,9 +205,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) * * We simply switch the debug_ptr to point to our new * external_debug_state which has been populated by the - * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY - * mechanism ensures the registers are updated on the - * world switch. + * debug ioctl. The existing DEBUG_DIRTY mechanism ensures + * the registers are updated on the world switch. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { /* Enable breakpoints/watchpoints */ @@ -216,7 +215,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); trace_kvm_arm_set_regset("BKPTS", get_num_brps(), &vcpu->arch.debug_ptr->dbg_bcr[0], @@ -246,7 +245,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) /* If KDE or MDE are set, perform a full save/restore cycle. 
*/ if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); /* Write mdcr_el2 changes since vcpu_load on VHE systems */ if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) @@ -298,16 +297,16 @@ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) */ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) && !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT))) - vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE; + vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE); /* Check if we have TRBE implemented and available at the host */ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) && !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG)) - vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE; + vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE); } void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) { - vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE | - KVM_ARM64_DEBUG_STATE_SAVE_TRBE); + vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); + vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 6012b08ecb14..ec8e4494873d 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -77,12 +77,14 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) BUG_ON(!current->mm); BUG_ON(test_thread_flag(TIF_SVE)); - vcpu->arch.flags &= ~KVM_ARM64_FP_ENABLED; - vcpu->arch.flags |= KVM_ARM64_FP_HOST; + if (!system_supports_fpsimd()) + return; + + vcpu->arch.fp_state = FP_STATE_HOST_OWNED; - vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED; + vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED; + vcpu_set_flag(vcpu, HOST_SVE_ENABLED); /* * We don't currently support SME guests but if we leave @@ -94,29 +96,28 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) * operations. Do this for ZA as well for now for simplicity. */ if (system_supports_sme()) { - vcpu->arch.flags &= ~KVM_ARM64_HOST_SME_ENABLED; + vcpu_clear_flag(vcpu, HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED; + vcpu_set_flag(vcpu, HOST_SME_ENABLED); - if (read_sysreg_s(SYS_SVCR) & - (SVCR_SM_MASK | SVCR_ZA_MASK)) { - vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; + if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { + vcpu->arch.fp_state = FP_STATE_FREE; fpsimd_save_and_flush_cpu_state(); } } } /* - * Called just before entering the guest once we are no longer - * preemptable. Syncs the host's TIF_FOREIGN_FPSTATE with the KVM - * mirror of the flag used by the hypervisor. + * Called just before entering the guest once we are no longer preemptable + * and interrupts are disabled. If we have managed to run anything using + * FP while we were preemptible (such as off the back of an interrupt), + * then neither the host nor the guest own the FP hardware (and it was the + * responsibility of the code that used FP to save the existing state). 
*/ void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) { if (test_thread_flag(TIF_FOREIGN_FPSTATE)) - vcpu->arch.flags |= KVM_ARM64_FP_FOREIGN_FPSTATE; - else - vcpu->arch.flags &= ~KVM_ARM64_FP_FOREIGN_FPSTATE; + vcpu->arch.fp_state = FP_STATE_FREE; } /* @@ -130,7 +131,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!irqs_disabled()); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. @@ -163,7 +164,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) */ if (has_vhe() && system_supports_sme()) { /* Also restore EL0 state seen on entry */ - if (vcpu->arch.flags & KVM_ARM64_HOST_SME_ENABLED) + if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); @@ -173,7 +174,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) CPACR_EL1_SMEN_EL1EN); } - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { if (vcpu_has_sve(vcpu)) { __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); @@ -192,7 +193,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * for EL0. To avoid spurious traps, restore the trap state * seen by kvm_arch_vcpu_load_fp(): */ - if (vcpu->arch.flags & KVM_ARM64_HOST_SVE_ENABLED) + if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index f66c0142b335..bbe5b393d689 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -17,6 +17,7 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/debug-monitors.h> +#include <asm/stacktrace/nvhe.h> #include <asm/traps.h> #include <kvm/arm_hypercalls.h> @@ -120,7 +121,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); } else { if (esr & ESR_ELx_WFx_ISS_WFxT) - vcpu->arch.flags |= KVM_ARM64_WFIT; + vcpu_set_flag(vcpu, IN_WFIT); kvm_vcpu_wfi(vcpu); } @@ -347,12 +348,15 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr, - (void *)panic_addr); + (void *)(panic_addr + kaslr_offset())); } else { kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr, - (void *)panic_addr); + (void *)(panic_addr + kaslr_offset())); } + /* Dump the nVHE hypervisor backtrace */ + kvm_nvhe_dump_backtrace(hyp_offset); + /* * Hyp has panicked and we're going to handle that by panicking the * kernel. 
The kernel offset will be revealed in the panic so we're diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index c5d009715402..b7557b25ed56 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -303,14 +303,14 @@ static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) { - switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { - case KVM_ARM64_EXCEPT_AA32_UND: + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): enter_exception32(vcpu, PSR_AA32_MODE_UND, 4); break; - case KVM_ARM64_EXCEPT_AA32_IABT: + case unpack_vcpu_flag(EXCEPT_AA32_IABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12); break; - case KVM_ARM64_EXCEPT_AA32_DABT: + case unpack_vcpu_flag(EXCEPT_AA32_DABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16); break; default: @@ -318,9 +318,8 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) break; } } else { - switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { - case (KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_EXCEPT_AA64_EL1): + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; default: @@ -340,12 +339,12 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) */ void __kvm_adjust_pc(struct kvm_vcpu *vcpu) { - if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { + if (vcpu_get_flag(vcpu, PENDING_EXCEPTION)) { kvm_inject_exception(vcpu); - vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_EXCEPT_MASK); - } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + vcpu_clear_flag(vcpu, PENDING_EXCEPTION); + vcpu_clear_flag(vcpu, EXCEPT_MASK); + } else if (vcpu_get_flag(vcpu, INCREMENT_PC)) { kvm_skip_instr(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; + vcpu_clear_flag(vcpu, INCREMENT_PC); } } diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 4ebe9f558f3a..961bbef104a6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -132,7 +132,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) return; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; @@ -151,7 +151,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) return; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; @@ -162,7 +162,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - vcpu->arch.flags &= ~KVM_ARM64_DEBUG_DIRTY; + vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 37d9f211c200..6cbbb6c02f66 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -37,22 +37,10 @@ struct kvm_exception_table_entry { extern struct kvm_exception_table_entry __start___kvm_ex_table; extern struct kvm_exception_table_entry 
__stop___kvm_ex_table; -/* Check whether the FP regs were dirtied while in the host-side run loop: */ -static inline bool update_fp_enabled(struct kvm_vcpu *vcpu) +/* Check whether the FP regs are owned by the guest */ +static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu) { - /* - * When the system doesn't support FP/SIMD, we cannot rely on - * the _TIF_FOREIGN_FPSTATE flag. However, we always inject an - * abort on the very first access to FP and thus we should never - * see KVM_ARM64_FP_ENABLED. For added safety, make sure we always - * trap the accesses. - */ - if (!system_supports_fpsimd() || - vcpu->arch.flags & KVM_ARM64_FP_FOREIGN_FPSTATE) - vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | - KVM_ARM64_FP_HOST); - - return !!(vcpu->arch.flags & KVM_ARM64_FP_ENABLED); + return vcpu->arch.fp_state == FP_STATE_GUEST_OWNED; } /* Save the 32-bit only FPSIMD system register state */ @@ -191,10 +179,8 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) isb(); /* Write out the host state if it's in the registers */ - if (vcpu->arch.flags & KVM_ARM64_FP_HOST) { + if (vcpu->arch.fp_state == FP_STATE_HOST_OWNED) __fpsimd_save_state(vcpu->arch.host_fpsimd_state); - vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; - } /* Restore the guest state */ if (sve_guest) @@ -206,7 +192,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); - vcpu->arch.flags |= KVM_ARM64_FP_ENABLED; + vcpu->arch.fp_state = FP_STATE_GUEST_OWNED; return true; } diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 7ecca8b07851..baa5b9b3dde5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -195,7 +195,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); - if (has_vhe() || vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) + if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); } @@ -212,7 +212,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) + if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index f9fe4dc21b1f..ed5d222e2826 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -12,13 +12,13 @@ HOST_EXTRACFLAGS += -I$(objtree)/include lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ +hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o -obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o +hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o 
-obj-$(CONFIG_DEBUG_LIST) += list_debug.o -obj-y += $(lib-objs) +hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o +hyp-obj-y += $(lib-objs) ## ## Build rules for compiling nVHE hyp code @@ -26,9 +26,9 @@ obj-y += $(lib-objs) ## file containing all nVHE hyp code and data. ## -hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y)) +hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y)) obj-y := kvm_nvhe.o -extra-y := $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o +targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o # 1) Compile all source files to `.nvhe.o` object files. The file extension # avoids file name clashes for files shared with VHE. diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index df361d839902..e17455773b98 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -84,10 +84,10 @@ static void __debug_restore_trace(u64 trfcr_el1) void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); /* Disable and flush Self-Hosted Trace generation */ - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1); } @@ -98,9 +98,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index ea6a397b64a6..b6c0188c4b35 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -177,13 +177,8 @@ SYM_FUNC_END(__host_hvc) b hyp_panic .L__hyp_sp_overflow\@: - /* - * Reset SP to the top of the stack, to allow handling the hyp_panic. - * This corrupts the stack but is ok, since we won't be attempting - * any unwinding here. - */ - ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1 - mov sp, x0 + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 b hyp_panic_bad_stack ASM_BUG() diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c new file mode 100644 index 000000000000..58f645ad66bc --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM nVHE hypervisor stack tracing support. + * + * Copyright (C) 2022 Google LLC + */ +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/memory.h> +#include <asm/percpu.h> + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +DEFINE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); + +/* + * hyp_prepare_backtrace - Prepare non-protected nVHE backtrace. + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the information needed by the host to unwind the non-protected + * nVHE hypervisor stack in EL1. 
+ */ +static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); + stacktrace_info->fp = fp; + stacktrace_info->pc = pc; +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#include <asm/stacktrace/nvhe.h> + +DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); + +static bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + unsigned long high = params->stack_hyp_va; + unsigned long low = high - PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, NULL); +} + +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, + void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry + * + * @arg : index of the entry in the stacktrace buffer + * @where : the program counter corresponding to the stack frame + * + * Save the return address of a stack frame to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static bool pkvm_save_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace); + int *idx = (int *)arg; + + /* + * Need 2 free slots: 1 for current entry and 1 for the + * delimiter. + */ + if (*idx > ARRAY_SIZE(pkvm_stacktrace) - 2) + return false; + + stacktrace[*idx] = where; + stacktrace[++*idx] = 0UL; + + return true; +} + +/* + * pkvm_save_backtrace - Saves the protected nVHE HYP stacktrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the unwinded stack addresses to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ + struct unwind_state state; + int idx = 0; + + kvm_nvhe_unwind_init(&state, fp, pc); + + unwind(&state, pkvm_save_backtrace_entry, &idx); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace + * + * @fp : frame pointer at which to start the unwinding. 
+ * @pc : program counter at which to start the unwinding. + * + * Saves the information needed by the host to dump the nVHE hypervisor + * backtrace. + */ +void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + if (is_protected_kvm_enabled()) + pkvm_save_backtrace(fp, pc); + else + hyp_prepare_backtrace(fp, pc); +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6db801db8f27..9f6385702061 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -34,6 +34,8 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -43,7 +45,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val = vcpu->arch.cptr_el2; val |= CPTR_EL2_TTA | CPTR_EL2_TAM; - if (!update_fp_enabled(vcpu)) { + if (!guest_owns_fp_regs(vcpu)) { val |= CPTR_EL2_TFP | CPTR_EL2_TZ; __activate_traps_fpsimd32(vcpu); } @@ -123,7 +125,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) } cptr = CPTR_EL2_DEFAULT; - if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)) + if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) cptr |= CPTR_EL2_TZ; if (cpus_have_final_cap(ARM64_SME)) cptr &= ~CPTR_EL2_TSM; @@ -335,7 +337,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_restore_state_nvhe(host_ctxt); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); @@ -375,6 +377,10 @@ asmlinkage void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } + /* Prepare to dump kvm nvhe hyp stacktrace */ + kvm_nvhe_prepare_backtrace((unsigned long)__builtin_frame_address(0), + _THIS_IP_); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } @@ -386,5 +392,5 @@ asmlinkage void __noreturn hyp_panic_bad_stack(void) asmlinkage void kvm_unexpected_el2_exception(void) { - return __kvm_unexpected_el2_exception(); + __kvm_unexpected_el2_exception(); } diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 6b94c3e6ff26..e20fa4475dac 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -38,9 +38,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 969f20daf97a..7acb87eaa092 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -55,7 +55,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val |= CPTR_EL2_TAM; - if (update_fp_enabled(vcpu)) { + if (guest_owns_fp_regs(vcpu)) { if (vcpu_has_sve(vcpu)) val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; } else { @@ -175,7 +175,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_host_state_vhe(host_ctxt); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); @@ -249,5 +249,5 @@ void __noreturn hyp_panic(void) asmlinkage void 
kvm_unexpected_el2_exception(void) { - return __kvm_unexpected_el2_exception(); + __kvm_unexpected_el2_exception(); } diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 007a12dd4351..7b44f6b3b547 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -79,7 +79,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) __sysreg_restore_user_state(guest_ctxt); __sysreg_restore_el1_state(guest_ctxt); - vcpu->arch.sysregs_loaded_on_cpu = true; + vcpu_set_flag(vcpu, SYSREGS_ON_CPU); activate_traps_vhe_load(vcpu); } @@ -110,5 +110,5 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); - vcpu->arch.sysregs_loaded_on_cpu = false; + vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 55a5dbe957e0..f32f4a2a347f 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -20,9 +20,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr bool is_aarch32 = vcpu_mode_is_32bit(vcpu); u64 esr = 0; - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -52,9 +50,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) { u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); /* * Build an unknown exception, depending on the instruction @@ -73,8 +69,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) static void inject_undef32(struct kvm_vcpu *vcpu) { - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_UND | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_UND); } /* @@ -97,14 +92,12 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) far = vcpu_read_sys_reg(vcpu, FAR_EL1); if (is_pabt) { - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_IABT); far &= GENMASK(31, 0); far |= (u64)addr << 32; vcpu_write_sys_reg(vcpu, fsr, IFSR32_EL2); } else { /* !iabt */ - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_DABT); far &= GENMASK(63, 32); far |= addr; vcpu_write_sys_reg(vcpu, fsr, ESR_EL1); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index f5651a05b6a8..87f1cd0df36e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -786,7 +786,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, { phys_addr_t addr; int ret = 0; - struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, }; + struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_R | diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 6c70c6f61c70..0e08fbe68715 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -81,7 +81,7 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. 
*/ - vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE; + vcpu_set_flag(vcpu, GUEST_HAS_SVE); return 0; } @@ -120,7 +120,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) } vcpu->arch.sve_state = buf; - vcpu->arch.flags |= KVM_ARM64_VCPU_SVE_FINALIZED; + vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED); return 0; } @@ -177,7 +177,7 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) !system_has_full_ptr_auth()) return -EINVAL; - vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_PTRAUTH; + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); return 0; } diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c new file mode 100644 index 000000000000..949d19d603fb --- /dev/null +++ b/arch/arm64/kvm/stacktrace.c @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * + * Copyright (C) 2022 Google LLC + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/stacktrace/nvhe.h> + +/* + * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs + * + * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to + * allow for guard pages below the stack. Consequently, the fixed offset address + * translation macros won't work here. + * + * The kernel VA is calculated as an offset from the kernel VA of the hypervisor + * stack base. + * + * Returns true on success and updates @addr to its corresponding kernel VA; + * otherwise returns false. 
+ */ +static bool kvm_nvhe_stack_kern_va(unsigned long *addr, + enum stack_type type) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + unsigned long hyp_base, kern_base, hyp_offset; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + switch (type) { + case STACK_TYPE_HYP: + kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); + hyp_base = (unsigned long)stacktrace_info->stack_base; + break; + case STACK_TYPE_OVERFLOW: + kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack); + hyp_base = (unsigned long)stacktrace_info->overflow_stack_base; + break; + default: + return false; + } + + hyp_offset = *addr - hyp_base; + + *addr = kern_base + hyp_offset; + + return true; +} + +static bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base; + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->stack_base; + unsigned long high = low + PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, + kvm_nvhe_stack_kern_va); +} + +static void unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry + * + * @arg : the hypervisor offset, used for address translation + * @where : the program counter corresponding to the stack frame + */ +static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long hyp_offset = (unsigned long)arg; + + /* Mask tags and convert to kern addr */ + where = (where & va_mask) + hyp_offset; + kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset())); + + return true; +} + +static void kvm_nvhe_dump_backtrace_start(void) +{ + kvm_err("nVHE call trace:\n"); +} + +static void kvm_nvhe_dump_backtrace_end(void) +{ + kvm_err("---[ end nVHE call trace ]---\n"); +} + +/* + * hyp_dump_backtrace - Dump the non-protected nVHE backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * The host can directly access HYP stack pages in non-protected + * mode, so the unwinding is done directly from EL1. This removes + * the need for shared buffers between host and hypervisor for + * the stacktrace. 
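The conversion in kvm_nvhe_stack_kern_va() above is a plain rebase: the host records the kernel VA of each stack's base, and a hypervisor address is translated by carrying its offset over to that base. A standalone sketch of the arithmetic (the addresses and the 4K size are illustrative, and the range check only keeps the example self-contained; the kernel helper selects the base by stack type instead):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STACK_SIZE 4096u	/* one page, as for the hyp stack */

struct stack_bases {
	uint64_t kern_base;	/* kernel VA of the stack page */
	uint64_t hyp_base;	/* hypervisor VA of the same page */
};

/* Rebase a hypervisor stack address into the kernel's mapping of the page. */
static bool hyp_stack_addr_to_kern(const struct stack_bases *b, uint64_t *addr)
{
	uint64_t offset = *addr - b->hyp_base;

	if (offset >= STACK_SIZE)	/* not on this stack */
		return false;

	*addr = b->kern_base + offset;
	return true;
}

int main(void)
{
	struct stack_bases b = {
		.kern_base = 0xffff000012340000ULL,
		.hyp_base  = 0x0000800000450000ULL,
	};
	uint64_t fp = b.hyp_base + 0x1a0;	/* frame pointer as seen at EL2 */

	if (hyp_stack_addr_to_kern(&b, &fp))
		printf("kernel VA: 0x%llx\n", (unsigned long long)fp);

	return 0;
}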
+ */ +static void hyp_dump_backtrace(unsigned long hyp_offset) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + struct unwind_state state; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc); + + kvm_nvhe_dump_backtrace_start(); + unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset); + kvm_nvhe_dump_backtrace_end(); +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], + pkvm_stacktrace); + +/* + * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * Dumping of the pKVM HYP backtrace is done by reading the + * stack addresses from the shared stacktrace buffer, since the + * host cannot directly access hypervisor memory in protected + * mode. + */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + unsigned long *stacktrace + = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); + int i; + + kvm_nvhe_dump_backtrace_start(); + /* The saved stacktrace is terminated by a null entry */ + for (i = 0; + i < ARRAY_SIZE(kvm_nvhe_sym(pkvm_stacktrace)) && stacktrace[i]; + i++) + kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); + kvm_nvhe_dump_backtrace_end(); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + */ +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) +{ + if (is_protected_kvm_enabled()) + pkvm_dump_backtrace(hyp_offset); + else + hyp_dump_backtrace(hyp_offset); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c4fb3874b5e2..c059b259aea6 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -34,18 +34,11 @@ #include "trace.h" /* - * All of this file is extremely similar to the ARM coproc.c, but the - * types are different. My gut feeling is that it should be pretty - * easy to merge, but that would be an ABI breakage -- again. VFP - * would also need to be abstracted. - * * For AArch32, we only take care of what is being trapped. Anything * that has to do with init and userspace access has to go via the * 64bit interface. 
*/ -static int reg_from_user(u64 *val, const void __user *uaddr, u64 id); -static int reg_to_user(void __user *uaddr, const u64 *val, u64 id); static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -72,7 +65,7 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val = 0x8badf00d8badf00d; - if (vcpu->arch.sysregs_loaded_on_cpu && + if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && __vcpu_read_sys_reg_from_cpu(reg, &val)) return val; @@ -81,7 +74,7 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { - if (vcpu->arch.sysregs_loaded_on_cpu && + if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && __vcpu_write_sys_reg_to_cpu(val, reg)) return; @@ -321,16 +314,8 @@ static bool trap_oslsr_el1(struct kvm_vcpu *vcpu, } static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - u64 id = sys_reg_to_index(rd); - u64 val; - int err; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - /* * The only modifiable bit is the OSLK bit. Refuse the write if * userspace attempts to change any other bit in the register. @@ -387,7 +372,7 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, { if (p->is_write) { vcpu_write_sys_reg(vcpu, p->regval, r->reg); - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); } else { p->regval = vcpu_read_sys_reg(vcpu, r->reg); } @@ -403,8 +388,8 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits * - * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the - * hyp.S code switches between host and guest values in future. + * All writes will set the DEBUG_DIRTY flag to ensure the hyp code + * switches between host and guest values in future. 
*/ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -420,7 +405,7 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); } static void dbg_to_reg(struct kvm_vcpu *vcpu, @@ -451,22 +436,16 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, } static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val; return 0; } static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; return 0; } @@ -493,23 +472,16 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, } static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; - + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val; return 0; } static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; return 0; } @@ -537,22 +509,16 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, } static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val; return 0; } static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; return 0; } @@ -579,22 +545,16 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, } static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val; return 0; } static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; return 0; } @@ -1227,16 +1187,9 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct 
kvm_one_reg *reg, void __user *uaddr) + u64 val) { - const u64 id = sys_reg_to_index(rd); u8 csv2, csv3; - int err; - u64 val; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; /* * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as @@ -1262,7 +1215,7 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, return -EINVAL; vcpu->kvm->arch.pfr0_csv2 = csv2; - vcpu->kvm->arch.pfr0_csv3 = csv3 ; + vcpu->kvm->arch.pfr0_csv3 = csv3; return 0; } @@ -1275,27 +1228,17 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, * to be changed. */ static int __get_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, void __user *uaddr, + const struct sys_reg_desc *rd, u64 *val, bool raz) { - const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(vcpu, rd, raz); - - return reg_to_user(uaddr, &val, id); + *val = read_id_reg(vcpu, rd, raz); + return 0; } static int __set_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, void __user *uaddr, + const struct sys_reg_desc *rd, u64 val, bool raz) { - const u64 id = sys_reg_to_index(rd); - int err; - u64 val; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - /* This is what we mean by invariant: you can't change it. */ if (val != read_id_reg(vcpu, rd, raz)) return -EINVAL; @@ -1304,47 +1247,37 @@ static int __set_id_reg(const struct kvm_vcpu *vcpu, } static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { bool raz = sysreg_visible_as_raz(vcpu, rd); - return __get_id_reg(vcpu, rd, uaddr, raz); + return __get_id_reg(vcpu, rd, val, raz); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { bool raz = sysreg_visible_as_raz(vcpu, rd); - return __set_id_reg(vcpu, rd, uaddr, raz); + return __set_id_reg(vcpu, rd, val, raz); } static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - return __set_id_reg(vcpu, rd, uaddr, true); + return __set_id_reg(vcpu, rd, val, true); } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - const u64 id = sys_reg_to_index(rd); - const u64 val = 0; - - return reg_to_user(uaddr, &val, id); + *val = 0; + return 0; } static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - int err; - u64 val; - - /* Perform the access even if we are going to ignore the value */ - err = reg_from_user(&val, uaddr, sys_reg_to_index(rd)); - if (err) - return err; - return 0; } @@ -2639,35 +2572,34 @@ static bool index_to_params(u64 id, struct sys_reg_params *params) } } -const struct sys_reg_desc *find_reg_by_id(u64 id, - struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num) { - if (!index_to_params(id, params)) + struct sys_reg_params params; + + if (!index_to_params(id, ¶ms)) return NULL; - return find_reg(params, table, num); + return find_reg(¶ms, table, num); } /* Decode an index value, and find the sys_reg_desc entry. 
*/ -static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, - u64 id) +static const struct sys_reg_desc * +id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, + const struct sys_reg_desc table[], unsigned int num) + { const struct sys_reg_desc *r; - struct sys_reg_params params; /* We only do sys_reg for now. */ if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) return NULL; - if (!index_to_params(id, ¶ms)) - return NULL; - - r = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + r = get_reg_by_id(id, table, num); /* Not saved in the sys_reg array and not otherwise accessible? */ - if (r && !(r->reg || r->get_user)) + if (r && (!(r->reg || r->get_user) || sysreg_hidden(vcpu, r))) r = NULL; return r; @@ -2707,48 +2639,30 @@ static struct sys_reg_desc invariant_sys_regs[] = { { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, }; -static int reg_from_user(u64 *val, const void __user *uaddr, u64 id) -{ - if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int reg_to_user(void __user *uaddr, const u64 *val, u64 id) -{ - if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int get_invariant_sys_reg(u64 id, void __user *uaddr) +static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) { - struct sys_reg_params params; const struct sys_reg_desc *r; - r = find_reg_by_id(id, ¶ms, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); + r = get_reg_by_id(id, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; - return reg_to_user(uaddr, &r->val, id); + return put_user(r->val, uaddr); } -static int set_invariant_sys_reg(u64 id, void __user *uaddr) +static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) { - struct sys_reg_params params; const struct sys_reg_desc *r; - int err; - u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ + u64 val; - r = find_reg_by_id(id, ¶ms, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); + r = get_reg_by_id(id, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; - err = reg_from_user(&val, uaddr, id); - if (err) - return err; + if (get_user(val, uaddr)) + return -EFAULT; /* This is what we mean by invariant: you can't change it. 
*/ if (r->val != val) @@ -2839,54 +2753,86 @@ static int demux_c15_set(u64 id, void __user *uaddr) } } -int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) { + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; + u64 val; + int ret; + + r = id_to_sys_reg_desc(vcpu, reg->id, table, num); + if (!r) + return -ENOENT; + + if (r->get_user) { + ret = (r->get_user)(vcpu, r, &val); + } else { + val = __vcpu_sys_reg(vcpu, r->reg); + ret = 0; + } + + if (!ret) + ret = put_user(val, uaddr); + + return ret; +} + +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) +{ void __user *uaddr = (void __user *)(unsigned long)reg->addr; + int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(reg->id, uaddr); - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) - return -ENOENT; + err = get_invariant_sys_reg(reg->id, uaddr); + if (err != -ENOENT) + return err; - r = index_to_sys_reg_desc(vcpu, reg->id); - if (!r) - return get_invariant_sys_reg(reg->id, uaddr); + return kvm_sys_reg_get_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); +} - /* Check for regs disabled by runtime config */ - if (sysreg_hidden(vcpu, r)) +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; + const struct sys_reg_desc *r; + u64 val; + int ret; + + if (get_user(val, uaddr)) + return -EFAULT; + + r = id_to_sys_reg_desc(vcpu, reg->id, table, num); + if (!r) return -ENOENT; - if (r->get_user) - return (r->get_user)(vcpu, r, reg, uaddr); + if (r->set_user) { + ret = (r->set_user)(vcpu, r, val); + } else { + __vcpu_sys_reg(vcpu, r->reg) = val; + ret = 0; + } - return reg_to_user(uaddr, &__vcpu_sys_reg(vcpu, r->reg), reg->id); + return ret; } int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { - const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; + int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(reg->id, uaddr); - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) - return -ENOENT; - - r = index_to_sys_reg_desc(vcpu, reg->id); - if (!r) - return set_invariant_sys_reg(reg->id, uaddr); - - /* Check for regs disabled by runtime config */ - if (sysreg_hidden(vcpu, r)) - return -ENOENT; - - if (r->set_user) - return (r->set_user)(vcpu, r, reg, uaddr); + err = set_invariant_sys_reg(reg->id, uaddr); + if (err != -ENOENT) + return err; - return reg_from_user(&__vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); + return kvm_sys_reg_set_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } static unsigned int num_demux_regs(void) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index aee8ea054f0d..a8c4cc32eb9a 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -75,9 +75,9 @@ struct sys_reg_desc { /* Custom get/set_user functions, fallback to generic if NULL */ int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr); + u64 *val); int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr); + u64 val); /* Return mask of REG_* runtime visibility overrides */ 
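The prototype change above is the heart of the sysreg uaccess rework: per-register get_user()/set_user() callbacks now exchange a plain u64 with the generic wrappers, and the single get_user()/put_user() on the userspace pointer happens in kvm_sys_reg_get_user()/kvm_sys_reg_set_user(). A minimal userspace model of that table-plus-callback shape (the register name, value and invariant-write rule are illustrative, not KVM's tables):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct reg_desc {
	const char *name;
	uint64_t value;
	/* New-style accessors: value passed as u64, no user pointers. */
	int (*get_user)(const struct reg_desc *rd, uint64_t *val);
	int (*set_user)(struct reg_desc *rd, uint64_t val);
};

static int get_plain(const struct reg_desc *rd, uint64_t *val)
{
	*val = rd->value;
	return 0;
}

/* An "invariant" register only accepts writes of its current value. */
static int set_invariant(struct reg_desc *rd, uint64_t val)
{
	return (val == rd->value) ? 0 : -EINVAL;
}

/* Generic wrapper: the one place where the value would cross to userspace. */
static int reg_get(const struct reg_desc *rd, uint64_t *out)
{
	if (rd->get_user)
		return rd->get_user(rd, out);
	*out = rd->value;
	return 0;
}

int main(void)
{
	struct reg_desc ctr = { "CTR_EL0", 0x8444c004, get_plain, set_invariant };
	uint64_t v = 0;

	if (!reg_get(&ctr, &v))
		printf("%s = 0x%llx\n", ctr.name, (unsigned long long)v);
	printf("write of a different value -> %d\n", ctr.set_user(&ctr, 0));
	return 0;
}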
unsigned int (*visibility)(const struct kvm_vcpu *vcpu, @@ -190,10 +190,16 @@ find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } -const struct sys_reg_desc *find_reg_by_id(u64 id, - struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num); +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num); + +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 07d5271e9f05..9e7c486b48c2 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -10,293 +10,357 @@ #include "vgic/vgic.h" #include "sys_regs.h" -static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v; struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + + /* + * Disallow restoring VM state if not supported by this + * hardware. + */ + host_pri_bits = FIELD_GET(ICC_CTLR_EL1_PRI_BITS_MASK, val) + 1; + if (host_pri_bits > vgic_v3_cpu->num_pri_bits) + return -EINVAL; + + vgic_v3_cpu->num_pri_bits = host_pri_bits; + + host_id_bits = FIELD_GET(ICC_CTLR_EL1_ID_BITS_MASK, val); + if (host_id_bits > vgic_v3_cpu->num_id_bits) + return -EINVAL; + + vgic_v3_cpu->num_id_bits = host_id_bits; + + host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2); + seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); + if (host_seis != seis) + return -EINVAL; + + host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2); + a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); + if (host_a3v != a3v) + return -EINVAL; + + /* + * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. + * The vgic_set_vmcr() will convert to ICH_VMCR layout. + */ + vmcr.cbpr = FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val); + vmcr.eoim = FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *valp) +{ + struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; + struct vgic_vmcr vmcr; u64 val; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - val = p->regval; - - /* - * Disallow restoring VM state if not supported by this - * hardware. 
- */ - host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >> - ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1; - if (host_pri_bits > vgic_v3_cpu->num_pri_bits) - return false; - - vgic_v3_cpu->num_pri_bits = host_pri_bits; - - host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >> - ICC_CTLR_EL1_ID_BITS_SHIFT; - if (host_id_bits > vgic_v3_cpu->num_id_bits) - return false; - - vgic_v3_cpu->num_id_bits = host_id_bits; - - host_seis = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT); - seis = (val & ICC_CTLR_EL1_SEIS_MASK) >> - ICC_CTLR_EL1_SEIS_SHIFT; - if (host_seis != seis) - return false; - - host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT); - a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT; - if (host_a3v != a3v) - return false; - - /* - * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. - * The vgic_set_vmcr() will convert to ICH_VMCR layout. - */ - vmcr.cbpr = (val & ICC_CTLR_EL1_CBPR_MASK) >> ICC_CTLR_EL1_CBPR_SHIFT; - vmcr.eoim = (val & ICC_CTLR_EL1_EOImode_MASK) >> ICC_CTLR_EL1_EOImode_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - val = 0; - val |= (vgic_v3_cpu->num_pri_bits - 1) << - ICC_CTLR_EL1_PRI_BITS_SHIFT; - val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) << - ICC_CTLR_EL1_SEIS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) << - ICC_CTLR_EL1_A3V_SHIFT; - /* - * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. - * Extract it directly using ICC_CTLR_EL1 reg definitions. - */ - val |= (vmcr.cbpr << ICC_CTLR_EL1_CBPR_SHIFT) & ICC_CTLR_EL1_CBPR_MASK; - val |= (vmcr.eoim << ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK; - - p->regval = val; - } + val = 0; + val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); + val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); + val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, + FIELD_GET(ICH_VTR_SEIS_MASK, + kvm_vgic_global_state.ich_vtr_el2)); + val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, + FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2)); + /* + * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. + * Extract it directly using ICC_CTLR_EL1 reg definitions. 
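The rewrite above replaces paired shift/mask constants with FIELD_GET()/FIELD_PREP() keyed on a single *_MASK definition. A compiler-only sketch of what those helpers boil down to for a contiguous mask (simplified relative to <linux/bitfield.h>, which adds compile-time checks; the field layout below is made up):

#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	((~0ULL >> (63 - (h))) & (~0ULL << (l)))

/* Lowest set bit of a contiguous mask gives the field's shift factor. */
#define FIELD_LSB(mask)		((mask) & ~((mask) << 1))
#define FIELD_GET(mask, reg)	(((reg) & (mask)) / FIELD_LSB(mask))
#define FIELD_PREP(mask, val)	(((uint64_t)(val) * FIELD_LSB(mask)) & (mask))

#define CTLR_PRI_BITS_MASK	GENMASK_ULL(10, 8)	/* illustrative field */

int main(void)
{
	uint64_t reg = 0;

	reg |= FIELD_PREP(CTLR_PRI_BITS_MASK, 5);	/* encode 5 into bits [10:8] */
	printf("raw reg       = 0x%llx\n", (unsigned long long)reg);
	printf("decoded field = %llu\n",
	       (unsigned long long)FIELD_GET(CTLR_PRI_BITS_MASK, reg));
	return 0;
}

Keying both directions on one mask is what lets the diff drop the *_SHIFT definitions entirely.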
+ */ + val |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, vmcr.cbpr); + val |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, vmcr.eoim); + + *valp = val; - return true; + return 0; } -static bool access_gic_pmr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.pmr = (p->regval & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK; - } + vmcr.pmr = FIELD_GET(ICC_PMR_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); - return true; + return 0; } -static bool access_gic_bpr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.bpr = (p->regval & ICC_BPR0_EL1_MASK) >> - ICC_BPR0_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) & - ICC_BPR0_EL1_MASK; - } + *val = FIELD_PREP(ICC_PMR_EL1_MASK, vmcr.pmr); - return true; + return 0; } -static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; - if (!p->is_write) - p->regval = 0; + vgic_get_vmcr(vcpu, &vmcr); + vmcr.bpr = FIELD_GET(ICC_BPR0_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (!vmcr.cbpr) { - if (p->is_write) { - vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >> - ICC_BPR1_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) & - ICC_BPR1_EL1_MASK; - } - } else { - if (!p->is_write) - p->regval = min((vmcr.bpr + 1), 7U); - } + *val = FIELD_PREP(ICC_BPR0_EL1_MASK, vmcr.bpr); - return true; + return 0; } -static bool access_gic_grpen0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.grpen0 = (p->regval & ICC_IGRPEN0_EL1_MASK) >> - ICC_IGRPEN0_EL1_SHIFT; + if (!vmcr.cbpr) { + vmcr.abpr = FIELD_GET(ICC_BPR1_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) & - ICC_IGRPEN0_EL1_MASK; } - return true; + return 0; } -static bool access_gic_grpen1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.grpen1 = (p->regval & ICC_IGRPEN1_EL1_MASK) >> - ICC_IGRPEN1_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) & - ICC_IGRPEN1_EL1_MASK; - } + if (!vmcr.cbpr) + *val = FIELD_PREP(ICC_BPR1_EL1_MASK, vmcr.abpr); + else + *val = min((vmcr.bpr + 1), 7U); + + + return 0; +} + +static int set_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.grpen0 = FIELD_GET(ICC_IGRPEN0_EL1_MASK, 
val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_PREP(ICC_IGRPEN0_EL1_MASK, vmcr.grpen0); - return true; + return 0; } -static void vgic_v3_access_apr_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, u8 apr, u8 idx) +static int set_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.grpen1 = FIELD_GET(ICC_IGRPEN1_EL1_MASK, val); + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = FIELD_GET(ICC_IGRPEN1_EL1_MASK, vmcr.grpen1); + + return 0; +} + +static void set_apr_reg(struct kvm_vcpu *vcpu, u64 val, u8 apr, u8 idx) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - uint32_t *ap_reg; if (apr) - ap_reg = &vgicv3->vgic_ap1r[idx]; + vgicv3->vgic_ap1r[idx] = val; else - ap_reg = &vgicv3->vgic_ap0r[idx]; + vgicv3->vgic_ap0r[idx] = val; +} + +static u64 get_apr_reg(struct kvm_vcpu *vcpu, u8 apr, u8 idx) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - if (p->is_write) - *ap_reg = p->regval; + if (apr) + return vgicv3->vgic_ap1r[idx]; else - p->regval = *ap_reg; + return vgicv3->vgic_ap0r[idx]; +} + +static int set_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) + +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + set_apr_reg(vcpu, val, 0, idx); + return 0; } -static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r, u8 apr) +static int get_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { u8 idx = r->Op2 & 3; if (idx > vgic_v3_max_apr_idx(vcpu)) - goto err; + return -EINVAL; - vgic_v3_access_apr_reg(vcpu, p, apr, idx); - return true; -err: - if (!p->is_write) - p->regval = 0; + *val = get_apr_reg(vcpu, 0, idx); - return false; + return 0; } -static bool access_gic_ap0r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) + +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + set_apr_reg(vcpu, val, 1, idx); + return 0; +} +static int get_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { - return access_gic_aprn(vcpu, p, r, 0); + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + *val = get_apr_reg(vcpu, 1, idx); + + return 0; } -static bool access_gic_ap1r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { - return access_gic_aprn(vcpu, p, r, 1); + /* Validate SRE bit */ + if (!(val & ICC_SRE_EL1_SRE)) + return -EINVAL; + + return 0; } -static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - /* Validate SRE bit */ - if (p->is_write) { - if (!(p->regval & ICC_SRE_EL1_SRE)) - return false; - } else { - p->regval = vgicv3->vgic_sre; - } + *val = vgicv3->vgic_sre; - return true; + 
return 0; } + static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { - { SYS_DESC(SYS_ICC_PMR_EL1), access_gic_pmr }, - { SYS_DESC(SYS_ICC_BPR0_EL1), access_gic_bpr0 }, - { SYS_DESC(SYS_ICC_AP0R0_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R1_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R2_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R3_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP1R0_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R1_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R2_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R3_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_BPR1_EL1), access_gic_bpr1 }, - { SYS_DESC(SYS_ICC_CTLR_EL1), access_gic_ctlr }, - { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, - { SYS_DESC(SYS_ICC_IGRPEN0_EL1), access_gic_grpen0 }, - { SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 }, + { SYS_DESC(SYS_ICC_PMR_EL1), + .set_user = set_gic_pmr, .get_user = get_gic_pmr, }, + { SYS_DESC(SYS_ICC_BPR0_EL1), + .set_user = set_gic_bpr0, .get_user = get_gic_bpr0, }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_BPR1_EL1), + .set_user = set_gic_bpr1, .get_user = get_gic_bpr1, }, + { SYS_DESC(SYS_ICC_CTLR_EL1), + .set_user = set_gic_ctlr, .get_user = get_gic_ctlr, }, + { SYS_DESC(SYS_ICC_SRE_EL1), + .set_user = set_gic_sre, .get_user = get_gic_sre, }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), + .set_user = set_gic_grpen0, .get_user = get_gic_grpen0, }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), + .set_user = set_gic_grpen1, .get_user = get_gic_grpen1, }, }; -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) +static u64 attr_to_id(u64 attr) { - struct sys_reg_params params; - u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; - - params.regval = *reg; - params.is_write = is_write; + return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP1_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRN_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRM_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP2_MASK, attr)); +} - if (find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, - ARRAY_SIZE(gic_v3_icc_reg_descs))) +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs))) return 0; return -ENXIO; } -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, + bool is_write) { - struct sys_reg_params params; - const struct sys_reg_desc *r; - u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; + struct kvm_one_reg reg = { + .id = attr_to_id(attr->attr), + .addr = attr->addr, + }; if (is_write) - params.regval = *reg; 
- params.is_write = is_write; - - r = find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, - ARRAY_SIZE(gic_v3_icc_reg_descs)); - if (!r) - return -ENXIO; - - if (!r->access(vcpu, ¶ms, r)) - return -EINVAL; - - if (!is_write) - *reg = params.regval; - - return 0; + return kvm_sys_reg_set_user(vcpu, ®, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); + else + return kvm_sys_reg_get_user(vcpu, ®, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); } diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index c6d52a1fd9c8..edeac2380591 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -41,11 +41,42 @@ static int vgic_check_type(struct kvm *kvm, int type_needed) return 0; } +int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) +{ + struct vgic_dist *vgic = &kvm->arch.vgic; + int r; + + mutex_lock(&kvm->lock); + switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_dist_base, dev_addr->addr, + SZ_4K, KVM_VGIC_V2_DIST_SIZE); + if (!r) + vgic->vgic_dist_base = dev_addr->addr; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_cpu_base, dev_addr->addr, + SZ_4K, KVM_VGIC_V2_CPU_SIZE); + if (!r) + vgic->vgic_cpu_base = dev_addr->addr; + break; + default: + r = -ENODEV; + } + + mutex_unlock(&kvm->lock); + + return r; +} + /** * kvm_vgic_addr - set or get vgic VM base addresses * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value + * @attr: pointer to the attribute being retrieved/updated * @write: if true set the address in the VM address space, if false read the * address * @@ -57,15 +88,22 @@ static int vgic_check_type(struct kvm *kvm, int type_needed) * overlapping regions in case of a virtual GICv3 here, since we don't know * the number of VCPUs yet, so we defer this check to map_resources(). 
*/ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) +static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write) { - int r = 0; + u64 __user *uaddr = (u64 __user *)attr->addr; struct vgic_dist *vgic = &kvm->arch.vgic; phys_addr_t *addr_ptr, alignment, size; u64 undef_value = VGIC_ADDR_UNDEF; + u64 addr; + int r; + + /* Reading a redistributor region addr implies getting the index */ + if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION) + if (get_user(addr, uaddr)) + return -EFAULT; mutex_lock(&kvm->lock); - switch (type) { + switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_dist_base; @@ -91,7 +129,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) if (r) break; if (write) { - r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); + r = vgic_v3_set_redist_base(kvm, 0, addr, 0); goto out; } rdreg = list_first_entry_or_null(&vgic->rd_regions, @@ -111,14 +149,12 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) if (r) break; - index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK; + index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK; if (write) { - gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK; - u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK) - >> KVM_VGIC_V3_RDIST_COUNT_SHIFT; - u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK) - >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT; + gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK; + u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr); + u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr); if (!count || flags) r = -EINVAL; @@ -134,9 +170,9 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; } - *addr = index; - *addr |= rdreg->base; - *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; + addr = index; + addr |= rdreg->base; + addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; goto out; } default: @@ -147,15 +183,19 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; if (write) { - r = vgic_check_iorange(kvm, *addr_ptr, *addr, alignment, size); + r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size); if (!r) - *addr_ptr = *addr; + *addr_ptr = addr; } else { - *addr = *addr_ptr; + addr = *addr_ptr; } out: mutex_unlock(&kvm->lock); + + if (!r && !write) + r = put_user(addr, uaddr); + return r; } @@ -165,17 +205,9 @@ static int vgic_set_common_attr(struct kvm_device *dev, int r; switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, true); return (r == -ENODEV) ? -ENXIO : r; - } case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; u32 val; @@ -214,6 +246,24 @@ static int vgic_set_common_attr(struct kvm_device *dev, r = vgic_init(dev->kvm); mutex_unlock(&dev->kvm->lock); return r; + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + /* + * OK, this one isn't common at all, but we + * want to handle all control group attributes + * in a single place. 
+ */ + if (vgic_check_type(dev->kvm, KVM_DEV_TYPE_ARM_VGIC_V3)) + return -ENXIO; + mutex_lock(&dev->kvm->lock); + + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + r = vgic_v3_save_pending_tables(dev->kvm); + unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return r; } break; } @@ -228,22 +278,9 @@ static int vgic_get_common_attr(struct kvm_device *dev, int r = -ENXIO; switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? -ENXIO : r; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, false); + return (r == -ENODEV) ? -ENXIO : r; case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; @@ -348,17 +385,18 @@ bool lock_all_vcpus(struct kvm *kvm) * * @dev: kvm device handle * @attr: kvm device attribute - * @reg: address the value is read or written * @is_write: true if userspace is writing a register */ static int vgic_v2_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - u32 *reg, bool is_write) + bool is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; int ret; + u32 val; ret = vgic_v2_parse_attr(dev, attr, ®_attr); if (ret) @@ -367,6 +405,10 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, vcpu = reg_attr.vcpu; addr = reg_attr.addr; + if (is_write) + if (get_user(val, uaddr)) + return -EFAULT; + mutex_lock(&dev->kvm->lock); ret = vgic_init(dev->kvm); @@ -380,10 +422,10 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); + ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); + ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, &val); break; default: ret = -EINVAL; @@ -393,57 +435,35 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, unlock_all_vcpus(dev->kvm); out: mutex_unlock(&dev->kvm->lock); + + if (!ret && !is_write) + ret = put_user(val, uaddr); + return ret; } static int vgic_v2_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v2_attr_regs_access(dev, attr, ®, true); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, true); + default: + return vgic_set_common_attr(dev, attr); } - - return -ENXIO; } static int vgic_v2_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_v2_attr_regs_access(dev, attr, ®, false); - if 
(ret) - return ret; - return put_user(reg, uaddr); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, false); + default: + return vgic_get_common_attr(dev, attr); } - - return -ENXIO; } static int vgic_v2_has_attr(struct kvm_device *dev, @@ -512,18 +532,18 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, * * @dev: kvm device handle * @attr: kvm device attribute - * @reg: address the value is read or written * @is_write: true if userspace is writing a register */ static int vgic_v3_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - u64 *reg, bool is_write) + bool is_write) { struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; + bool uaccess; + u32 val; int ret; - u32 tmp32; ret = vgic_v3_parse_attr(dev, attr, ®_attr); if (ret) @@ -532,6 +552,21 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, vcpu = reg_attr.vcpu; addr = reg_attr.addr; + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + /* Sysregs uaccess is performed by the sysreg handling code */ + uaccess = false; + break; + default: + uaccess = true; + } + + if (uaccess && is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + if (get_user(val, uaddr)) + return -EFAULT; + } + mutex_lock(&dev->kvm->lock); if (unlikely(!vgic_initialized(dev->kvm))) { @@ -546,29 +581,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; + ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; + ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &val); break; - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 regid; - - regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, - regid, reg); + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write); break; - } case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { unsigned int info, intid; @@ -578,7 +598,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, intid = attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, - intid, reg); + intid, &val); } else { ret = -EINVAL; } @@ -592,117 +612,41 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, unlock_all_vcpus(dev->kvm); out: mutex_unlock(&dev->kvm->lock); + + if (!ret && uaccess && !is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + ret = put_user(val, uaddr); + } + return ret; } static int vgic_v3_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 tmp32; - u64 reg; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v3_attr_regs_access(dev, attr, ®, true); 
- } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - int ret; - - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: - mutex_lock(&dev->kvm->lock); - - if (!lock_all_vcpus(dev->kvm)) { - mutex_unlock(&dev->kvm->lock); - return -EBUSY; - } - ret = vgic_v3_save_pending_tables(dev->kvm); - unlock_all_vcpus(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return ret; - } - break; - } + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return vgic_v3_attr_regs_access(dev, attr, true); + default: + return vgic_set_common_attr(dev, attr); } - return -ENXIO; } static int vgic_v3_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return vgic_v3_attr_regs_access(dev, attr, false); + default: + return vgic_get_common_attr(dev, attr); } - return -ENXIO; } static int vgic_v3_has_attr(struct kvm_device *dev, diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index f15e29cc63ce..91201f743033 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -986,12 +986,8 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) iodev.base_addr = 0; break; } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 reg, id; - - id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, ®); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_has_cpu_sysregs_attr(vcpu, attr); default: return -ENXIO; } @@ -1158,7 +1154,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, } int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val) + u32 intid, u32 *val) { if (intid % 32) return -EINVAL; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 997d0fce2088..b32d434c1d4a 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -775,10 +775,10 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, } } -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) { int i; - u64 val = 0; + u32 val = 0; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; for (i = 0; i 
< 32; i++) { @@ -798,7 +798,7 @@ u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) } void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val) + const u32 val) { int i; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index 6082d4b66d39..5b490a4dfa5e 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -207,10 +207,10 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, bool is_write, int offset, u32 *val); -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val); + const u32 val); unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 4c6bdd321faa..0c8da72953f0 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -245,12 +245,11 @@ int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u64 id, u64 *val); -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg); +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, bool is_write); +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val); + u32 intid, u32 *val); int kvm_register_vgic_device(unsigned long type); void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 6d85655e7edf..17516afc389a 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -156,6 +156,18 @@ (_AC(1, UL) << IRQ_S_TIMER) | \ (_AC(1, UL) << IRQ_S_EXT)) +/* xENVCFG flags */ +#define ENVCFG_STCE (_AC(1, ULL) << 63) +#define ENVCFG_PBMTE (_AC(1, ULL) << 62) +#define ENVCFG_CBZE (_AC(1, UL) << 7) +#define ENVCFG_CBCFE (_AC(1, UL) << 6) +#define ENVCFG_CBIE_SHIFT 4 +#define ENVCFG_CBIE (_AC(0x3, UL) << ENVCFG_CBIE_SHIFT) +#define ENVCFG_CBIE_ILL _AC(0x0, UL) +#define ENVCFG_CBIE_FLUSH _AC(0x1, UL) +#define ENVCFG_CBIE_INV _AC(0x3, UL) +#define ENVCFG_FIOM _AC(0x1, UL) + /* symbolic CSR names: */ #define CSR_CYCLE 0xc00 #define CSR_TIME 0xc01 @@ -252,7 +264,9 @@ #define CSR_HTIMEDELTA 0x605 #define CSR_HCOUNTEREN 0x606 #define CSR_HGEIE 0x607 +#define CSR_HENVCFG 0x60a #define CSR_HTIMEDELTAH 0x615 +#define CSR_HENVCFGH 0x61a #define CSR_HTVAL 0x643 #define CSR_HIP 0x644 #define CSR_HVIP 0x645 @@ -264,6 +278,8 @@ #define CSR_MISA 0x301 #define CSR_MIE 0x304 #define CSR_MTVEC 0x305 +#define CSR_MENVCFG 0x30a +#define CSR_MENVCFGH 0x31a #define CSR_MSCRATCH 0x340 #define CSR_MEPC 0x341 #define CSR_MCAUSE 0x342 diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 319c8aeb42af..60c517e4d576 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -14,7 +14,9 @@ #include <linux/kvm_types.h> #include <linux/spinlock.h> #include <asm/csr.h> 
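The xENVCFG bit and HENVCFG CSR definitions just added to csr.h feed the RISC-V side of this series: once a vCPU's ISA bitmap (reworked below in kvm_host.h and vcpu.c) has Svpbmt set, kvm_riscv_vcpu_update_config(), added later in this patch, programs ENVCFG_PBMTE into CSR_HENVCFG (and CSR_HENVCFGH on RV32) at vcpu load time. Userspace opts a vCPU into the extension through the existing ISA-extension one-reg interface using the new KVM_RISCV_ISA_EXT_SVPBMT ID. The sketch below is illustrative only and not part of the patch; it assumes an RV64 host (so KVM_REG_SIZE_U64 matches the kernel's sizeof(unsigned long) check), the riscv uapi headers, and a hypothetical vcpu_fd.

/*
 * Illustrative sketch only (not part of the patch): enable Svpbmt for a
 * vCPU from a VMM before the vCPU first runs.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int riscv_vcpu_enable_svpbmt(int vcpu_fd)
{
        uint64_t val = 1;       /* 1 = enable, 0 = disable */
        struct kvm_one_reg reg = {
                .id = KVM_REG_RISCV | KVM_REG_SIZE_U64 |
                      KVM_REG_RISCV_ISA_EXT | KVM_RISCV_ISA_EXT_SVPBMT,
                .addr = (uint64_t)(uintptr_t)&val,
        };

        /*
         * Per the vcpu.c changes in this patch, this fails with
         * -EOPNOTSUPP if the host lacks Svpbmt or the vCPU has already
         * run at least once.
         */
        return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}

Reading the same register back with KVM_GET_ONE_REG returns 1 once the extension is enabled for that vCPU, and a Svpbmt-enabled guest then gets ENVCFG_PBMTE set in henvcfg on every vcpu load.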
+#include <asm/hwcap.h> #include <asm/kvm_vcpu_fp.h> +#include <asm/kvm_vcpu_insn.h> #include <asm/kvm_vcpu_timer.h> #define KVM_MAX_VCPUS 1024 @@ -63,6 +65,8 @@ struct kvm_vcpu_stat { u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; + u64 csr_exit_user; + u64 csr_exit_kernel; u64 exits; }; @@ -90,14 +94,6 @@ struct kvm_arch { struct kvm_guest_timer timer; }; -struct kvm_mmio_decode { - unsigned long insn; - int insn_len; - int len; - int shift; - int return_handled; -}; - struct kvm_sbi_context { int return_handled; }; @@ -170,7 +166,7 @@ struct kvm_vcpu_arch { int last_exit_cpu; /* ISA feature bits (similar to MISA) */ - unsigned long isa; + DECLARE_BITMAP(isa, RISCV_ISA_EXT_MAX); /* SSCRATCH, STVEC, and SCOUNTEREN of Host */ unsigned long host_sscratch; @@ -216,6 +212,9 @@ struct kvm_vcpu_arch { /* MMIO instruction details */ struct kvm_mmio_decode mmio_decode; + /* CSR instruction details */ + struct kvm_csr_decode csr_decode; + /* SBI context */ struct kvm_sbi_context sbi_context; @@ -285,6 +284,11 @@ void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, void kvm_riscv_hfence_vvma_all(struct kvm *kvm, unsigned long hbase, unsigned long hmask); +int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, + phys_addr_t hpa, unsigned long size, + bool writable, bool in_atomic); +void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, + unsigned long size); int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write); @@ -303,14 +307,12 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); void __kvm_riscv_unpriv_trap(void); -void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu); unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, bool read_insn, unsigned long guest_addr, struct kvm_cpu_trap *trap); void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, struct kvm_cpu_trap *trap); -int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap); diff --git a/arch/riscv/include/asm/kvm_vcpu_fp.h b/arch/riscv/include/asm/kvm_vcpu_fp.h index 4da9b8e0f050..b5540147409f 100644 --- a/arch/riscv/include/asm/kvm_vcpu_fp.h +++ b/arch/riscv/include/asm/kvm_vcpu_fp.h @@ -22,9 +22,9 @@ void __kvm_riscv_fp_d_restore(struct kvm_cpu_context *context); void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_guest_fp_save(struct kvm_cpu_context *cntx, - unsigned long isa); + const unsigned long *isa); void kvm_riscv_vcpu_guest_fp_restore(struct kvm_cpu_context *cntx, - unsigned long isa); + const unsigned long *isa); void kvm_riscv_vcpu_host_fp_save(struct kvm_cpu_context *cntx); void kvm_riscv_vcpu_host_fp_restore(struct kvm_cpu_context *cntx); #else @@ -32,12 +32,12 @@ static inline void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu) { } static inline void kvm_riscv_vcpu_guest_fp_save(struct kvm_cpu_context *cntx, - unsigned long isa) + const unsigned long *isa) { } static inline void kvm_riscv_vcpu_guest_fp_restore( struct kvm_cpu_context *cntx, - unsigned long isa) + const unsigned long *isa) { } static inline void kvm_riscv_vcpu_host_fp_save(struct kvm_cpu_context *cntx) diff --git a/arch/riscv/include/asm/kvm_vcpu_insn.h b/arch/riscv/include/asm/kvm_vcpu_insn.h new file mode 100644 index 000000000000..350011c83581 --- /dev/null +++ b/arch/riscv/include/asm/kvm_vcpu_insn.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Ventana Micro Systems Inc. 
+ */ + +#ifndef __KVM_VCPU_RISCV_INSN_H +#define __KVM_VCPU_RISCV_INSN_H + +struct kvm_vcpu; +struct kvm_run; +struct kvm_cpu_trap; + +struct kvm_mmio_decode { + unsigned long insn; + int insn_len; + int len; + int shift; + int return_handled; +}; + +struct kvm_csr_decode { + unsigned long insn; + int return_handled; +}; + +/* Return values used by function emulating a particular instruction */ +enum kvm_insn_return { + KVM_INSN_EXIT_TO_USER_SPACE = 0, + KVM_INSN_CONTINUE_NEXT_SEPC, + KVM_INSN_CONTINUE_SAME_SEPC, + KVM_INSN_ILLEGAL_TRAP, + KVM_INSN_VIRTUAL_TRAP +}; + +void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_csr_return(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_cpu_trap *trap); + +int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long fault_addr, + unsigned long htinst); +int kvm_riscv_vcpu_mmio_store(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long fault_addr, + unsigned long htinst); +int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); + +#endif diff --git a/arch/riscv/include/asm/kvm_vcpu_timer.h b/arch/riscv/include/asm/kvm_vcpu_timer.h index 375281eb49e0..50138e2eb91b 100644 --- a/arch/riscv/include/asm/kvm_vcpu_timer.h +++ b/arch/riscv/include/asm/kvm_vcpu_timer.h @@ -39,6 +39,6 @@ int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_timer_deinit(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_timer_reset(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu); -int kvm_riscv_guest_timer_init(struct kvm *kvm); +void kvm_riscv_guest_timer_init(struct kvm *kvm); #endif diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 6119368ba6d5..24b2a6e27698 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -96,6 +96,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_H, KVM_RISCV_ISA_EXT_I, KVM_RISCV_ISA_EXT_M, + KVM_RISCV_ISA_EXT_SVPBMT, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index e5c56182f48f..019df9208bdd 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -17,6 +17,7 @@ kvm-y += mmu.o kvm-y += vcpu.o kvm-y += vcpu_exit.o kvm-y += vcpu_fp.o +kvm-y += vcpu_insn.o kvm-y += vcpu_switch.o kvm-y += vcpu_sbi.o kvm-$(CONFIG_RISCV_SBI_V01) += vcpu_sbi_v01.o diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 9826073fbc67..3a35b2d95697 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -343,23 +343,24 @@ static void gstage_wp_memory_region(struct kvm *kvm, int slot) kvm_flush_remote_tlbs(kvm); } -static int gstage_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, - unsigned long size, bool writable) +int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, + phys_addr_t hpa, unsigned long size, + bool writable, bool in_atomic) { pte_t pte; int ret = 0; unsigned long pfn; phys_addr_t addr, end; - struct kvm_mmu_memory_cache pcache; - - memset(&pcache, 0, sizeof(pcache)); - pcache.gfp_zero = __GFP_ZERO; + struct kvm_mmu_memory_cache pcache = { + .gfp_custom = (in_atomic) ? 
GFP_ATOMIC | __GFP_ACCOUNT : 0, + .gfp_zero = __GFP_ZERO, + }; end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(hpa); for (addr = gpa; addr < end; addr += PAGE_SIZE) { - pte = pfn_pte(pfn, PAGE_KERNEL); + pte = pfn_pte(pfn, PAGE_KERNEL_IO); if (!writable) pte = pte_wrprotect(pte); @@ -382,6 +383,13 @@ out: return ret; } +void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size) +{ + spin_lock(&kvm->mmu_lock); + gstage_unmap_range(kvm, gpa, size, false); + spin_unlock(&kvm->mmu_lock); +} + void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, @@ -517,8 +525,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, goto out; } - ret = gstage_ioremap(kvm, gpa, pa, - vm_end - vm_start, writable); + ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa, + vm_end - vm_start, + writable, false); if (ret) break; } @@ -611,7 +620,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, { int ret; kvm_pfn_t hfn; - bool writeable; + bool writable; short vma_pageshift; gfn_t gfn = gpa >> PAGE_SHIFT; struct vm_area_struct *vma; @@ -659,7 +668,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, mmu_seq = kvm->mmu_notifier_seq; - hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable); + hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable); if (hfn == KVM_PFN_ERR_HWPOISON) { send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, vma_pageshift, current); @@ -673,14 +682,14 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, * for write faults. */ if (logging && !is_write) - writeable = false; + writable = false; spin_lock(&kvm->mmu_lock); if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (writeable) { + if (writable) { kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index f3455dc013fa..5d271b597613 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -26,6 +26,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, csr_exit_user), + STATS_DESC_COUNTER(VCPU, csr_exit_kernel), STATS_DESC_COUNTER(VCPU, exits) }; @@ -38,16 +40,58 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -#define KVM_RISCV_ISA_DISABLE_ALLOWED (riscv_isa_extension_mask(d) | \ - riscv_isa_extension_mask(f)) +#define KVM_RISCV_BASE_ISA_MASK GENMASK(25, 0) -#define KVM_RISCV_ISA_DISABLE_NOT_ALLOWED (riscv_isa_extension_mask(a) | \ - riscv_isa_extension_mask(c) | \ - riscv_isa_extension_mask(i) | \ - riscv_isa_extension_mask(m)) +/* Mapping between KVM ISA Extension ID & Host ISA extension ID */ +static const unsigned long kvm_isa_ext_arr[] = { + RISCV_ISA_EXT_a, + RISCV_ISA_EXT_c, + RISCV_ISA_EXT_d, + RISCV_ISA_EXT_f, + RISCV_ISA_EXT_h, + RISCV_ISA_EXT_i, + RISCV_ISA_EXT_m, + RISCV_ISA_EXT_SVPBMT, +}; + +static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) +{ + unsigned long i; + + for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { + if (kvm_isa_ext_arr[i] == base_ext) + return i; + } + + return KVM_RISCV_ISA_EXT_MAX; +} + +static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) +{ + switch (ext) { + case KVM_RISCV_ISA_EXT_H: + return false; + default: + break; + } + + return true; +} + +static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) +{ + switch (ext) { + case KVM_RISCV_ISA_EXT_A: + case 
KVM_RISCV_ISA_EXT_C: + case KVM_RISCV_ISA_EXT_I: + case KVM_RISCV_ISA_EXT_M: + return false; + default: + break; + } -#define KVM_RISCV_ISA_ALLOWED (KVM_RISCV_ISA_DISABLE_ALLOWED | \ - KVM_RISCV_ISA_DISABLE_NOT_ALLOWED) + return true; +} static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) { @@ -99,13 +143,20 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *cntx; struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; + unsigned long host_isa, i; /* Mark this VCPU never ran */ vcpu->arch.ran_atleast_once = false; vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; + bitmap_zero(vcpu->arch.isa, RISCV_ISA_EXT_MAX); /* Setup ISA features available to VCPU */ - vcpu->arch.isa = riscv_isa_extension_base(NULL) & KVM_RISCV_ISA_ALLOWED; + for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) { + host_isa = kvm_isa_ext_arr[i]; + if (__riscv_isa_extension_available(NULL, host_isa) && + kvm_riscv_vcpu_isa_enable_allowed(i)) + set_bit(host_isa, vcpu->arch.isa); + } /* Setup VCPU hfence queue */ spin_lock_init(&vcpu->arch.hfence_lock); @@ -199,7 +250,7 @@ static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu, switch (reg_num) { case KVM_REG_RISCV_CONFIG_REG(isa): - reg_val = vcpu->arch.isa; + reg_val = vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK; break; default: return -EINVAL; @@ -219,7 +270,7 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_RISCV_CONFIG); - unsigned long reg_val; + unsigned long i, isa_ext, reg_val; if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) return -EINVAL; @@ -227,13 +278,32 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) return -EFAULT; + /* This ONE REG interface is only defined for single letter extensions */ + if (fls(reg_val) >= RISCV_ISA_EXT_BASE) + return -EINVAL; + switch (reg_num) { case KVM_REG_RISCV_CONFIG_REG(isa): if (!vcpu->arch.ran_atleast_once) { - /* Ignore the disable request for these extensions */ - vcpu->arch.isa = reg_val | KVM_RISCV_ISA_DISABLE_NOT_ALLOWED; - vcpu->arch.isa &= riscv_isa_extension_base(NULL); - vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED; + /* Ignore the enable/disable request for certain extensions */ + for (i = 0; i < RISCV_ISA_EXT_BASE; i++) { + isa_ext = kvm_riscv_vcpu_base2isa_ext(i); + if (isa_ext >= KVM_RISCV_ISA_EXT_MAX) { + reg_val &= ~BIT(i); + continue; + } + if (!kvm_riscv_vcpu_isa_enable_allowed(isa_ext)) + if (reg_val & BIT(i)) + reg_val &= ~BIT(i); + if (!kvm_riscv_vcpu_isa_disable_allowed(isa_ext)) + if (!(reg_val & BIT(i))) + reg_val |= BIT(i); + } + reg_val &= riscv_isa_extension_base(NULL); + /* Do not modify anything beyond single letter extensions */ + reg_val = (vcpu->arch.isa[0] & ~KVM_RISCV_BASE_ISA_MASK) | + (reg_val & KVM_RISCV_BASE_ISA_MASK); + vcpu->arch.isa[0] = reg_val; kvm_riscv_vcpu_fp_reset(vcpu); } else { return -EOPNOTSUPP; @@ -374,17 +444,6 @@ static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, return 0; } -/* Mapping between KVM ISA Extension ID & Host ISA extension ID */ -static unsigned long kvm_isa_ext_arr[] = { - RISCV_ISA_EXT_a, - RISCV_ISA_EXT_c, - RISCV_ISA_EXT_d, - RISCV_ISA_EXT_f, - RISCV_ISA_EXT_h, - RISCV_ISA_EXT_i, - RISCV_ISA_EXT_m, -}; - static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { @@ -399,11 +458,12 @@ static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu, if (KVM_REG_SIZE(reg->id) != sizeof(unsigned 
long)) return -EINVAL; - if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) + if (reg_num >= KVM_RISCV_ISA_EXT_MAX || + reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) return -EINVAL; host_isa_ext = kvm_isa_ext_arr[reg_num]; - if (__riscv_isa_extension_available(&vcpu->arch.isa, host_isa_ext)) + if (__riscv_isa_extension_available(vcpu->arch.isa, host_isa_ext)) reg_val = 1; /* Mark the given extension as available */ if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) @@ -422,12 +482,12 @@ static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, KVM_REG_RISCV_ISA_EXT); unsigned long reg_val; unsigned long host_isa_ext; - unsigned long host_isa_ext_mask; if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) return -EINVAL; - if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) + if (reg_num >= KVM_RISCV_ISA_EXT_MAX || + reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) return -EINVAL; if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) @@ -437,30 +497,19 @@ static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, if (!__riscv_isa_extension_available(NULL, host_isa_ext)) return -EOPNOTSUPP; - if (host_isa_ext >= RISCV_ISA_EXT_BASE && - host_isa_ext < RISCV_ISA_EXT_MAX) { + if (!vcpu->arch.ran_atleast_once) { /* - * Multi-letter ISA extension. Currently there is no provision - * to enable/disable the multi-letter ISA extensions for guests. - * Return success if the request is to enable any ISA extension - * that is available in the hardware. - * Return -EOPNOTSUPP otherwise. + * All multi-letter extension and a few single letter + * extension can be disabled */ - if (!reg_val) - return -EOPNOTSUPP; + if (reg_val == 1 && + kvm_riscv_vcpu_isa_enable_allowed(reg_num)) + set_bit(host_isa_ext, vcpu->arch.isa); + else if (!reg_val && + kvm_riscv_vcpu_isa_disable_allowed(reg_num)) + clear_bit(host_isa_ext, vcpu->arch.isa); else - return 0; - } - - /* Single letter base ISA extension */ - if (!vcpu->arch.ran_atleast_once) { - host_isa_ext_mask = BIT_MASK(host_isa_ext); - if (!reg_val && (host_isa_ext_mask & KVM_RISCV_ISA_DISABLE_ALLOWED)) - vcpu->arch.isa &= ~host_isa_ext_mask; - else - vcpu->arch.isa |= host_isa_ext_mask; - vcpu->arch.isa &= riscv_isa_extension_base(NULL); - vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED; + return -EINVAL; kvm_riscv_vcpu_fp_reset(vcpu); } else { return -EOPNOTSUPP; @@ -729,6 +778,19 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -EINVAL; } +static void kvm_riscv_vcpu_update_config(const unsigned long *isa) +{ + u64 henvcfg = 0; + + if (__riscv_isa_extension_available(isa, RISCV_ISA_EXT_SVPBMT)) + henvcfg |= ENVCFG_PBMTE; + + csr_write(CSR_HENVCFG, henvcfg); +#ifdef CONFIG_32BIT + csr_write(CSR_HENVCFGH, henvcfg >> 32); +#endif +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; @@ -743,6 +805,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) csr_write(CSR_HVIP, csr->hvip); csr_write(CSR_VSATP, csr->vsatp); + kvm_riscv_vcpu_update_config(vcpu->arch.isa); + kvm_riscv_gstage_update_hgatp(vcpu); kvm_riscv_vcpu_timer_restore(vcpu); @@ -853,22 +917,26 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); - /* Process MMIO value returned from user-space */ - if (run->exit_reason == KVM_EXIT_MMIO) { + switch (run->exit_reason) { + case KVM_EXIT_MMIO: + /* Process MMIO value returned from user-space */ ret = kvm_riscv_vcpu_mmio_return(vcpu, vcpu->run); - if (ret) { - 
kvm_vcpu_srcu_read_unlock(vcpu); - return ret; - } - } - - /* Process SBI value returned from user-space */ - if (run->exit_reason == KVM_EXIT_RISCV_SBI) { + break; + case KVM_EXIT_RISCV_SBI: + /* Process SBI value returned from user-space */ ret = kvm_riscv_vcpu_sbi_return(vcpu, vcpu->run); - if (ret) { - kvm_vcpu_srcu_read_unlock(vcpu); - return ret; - } + break; + case KVM_EXIT_RISCV_CSR: + /* Process CSR value returned from user-space */ + ret = kvm_riscv_vcpu_csr_return(vcpu, vcpu->run); + break; + default: + ret = 0; + break; + } + if (ret) { + kvm_vcpu_srcu_read_unlock(vcpu); + return ret; } if (run->immediate_exit) { @@ -890,8 +958,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_riscv_check_vcpu_requests(vcpu); - preempt_disable(); - local_irq_disable(); /* @@ -928,7 +994,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; local_irq_enable(); - preempt_enable(); kvm_vcpu_srcu_read_lock(vcpu); continue; } @@ -962,6 +1027,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Syncup interrupts state with HW */ kvm_riscv_vcpu_sync_interrupts(vcpu); + preempt_disable(); + /* * We must ensure that any pending interrupts are taken before * we exit guest timing so that timer ticks are accounted as diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index dbb09afd7546..d5c36386878a 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -6,435 +6,34 @@ * Anup Patel <anup.patel@wdc.com> */ -#include <linux/bitops.h> -#include <linux/errno.h> -#include <linux/err.h> #include <linux/kvm_host.h> #include <asm/csr.h> -#define INSN_OPCODE_MASK 0x007c -#define INSN_OPCODE_SHIFT 2 -#define INSN_OPCODE_SYSTEM 28 - -#define INSN_MASK_WFI 0xffffffff -#define INSN_MATCH_WFI 0x10500073 - -#define INSN_MATCH_LB 0x3 -#define INSN_MASK_LB 0x707f -#define INSN_MATCH_LH 0x1003 -#define INSN_MASK_LH 0x707f -#define INSN_MATCH_LW 0x2003 -#define INSN_MASK_LW 0x707f -#define INSN_MATCH_LD 0x3003 -#define INSN_MASK_LD 0x707f -#define INSN_MATCH_LBU 0x4003 -#define INSN_MASK_LBU 0x707f -#define INSN_MATCH_LHU 0x5003 -#define INSN_MASK_LHU 0x707f -#define INSN_MATCH_LWU 0x6003 -#define INSN_MASK_LWU 0x707f -#define INSN_MATCH_SB 0x23 -#define INSN_MASK_SB 0x707f -#define INSN_MATCH_SH 0x1023 -#define INSN_MASK_SH 0x707f -#define INSN_MATCH_SW 0x2023 -#define INSN_MASK_SW 0x707f -#define INSN_MATCH_SD 0x3023 -#define INSN_MASK_SD 0x707f - -#define INSN_MATCH_C_LD 0x6000 -#define INSN_MASK_C_LD 0xe003 -#define INSN_MATCH_C_SD 0xe000 -#define INSN_MASK_C_SD 0xe003 -#define INSN_MATCH_C_LW 0x4000 -#define INSN_MASK_C_LW 0xe003 -#define INSN_MATCH_C_SW 0xc000 -#define INSN_MASK_C_SW 0xe003 -#define INSN_MATCH_C_LDSP 0x6002 -#define INSN_MASK_C_LDSP 0xe003 -#define INSN_MATCH_C_SDSP 0xe002 -#define INSN_MASK_C_SDSP 0xe003 -#define INSN_MATCH_C_LWSP 0x4002 -#define INSN_MASK_C_LWSP 0xe003 -#define INSN_MATCH_C_SWSP 0xc002 -#define INSN_MASK_C_SWSP 0xe003 - -#define INSN_16BIT_MASK 0x3 - -#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) - -#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 
2 : 4) - -#ifdef CONFIG_64BIT -#define LOG_REGBYTES 3 -#else -#define LOG_REGBYTES 2 -#endif -#define REGBYTES (1 << LOG_REGBYTES) - -#define SH_RD 7 -#define SH_RS1 15 -#define SH_RS2 20 -#define SH_RS2C 2 - -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) -#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ - (RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 1) << 6)) -#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 5, 2) << 6)) -#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 2) << 6)) -#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ - (RV_X(x, 12, 1) << 5) | \ - (RV_X(x, 2, 3) << 6)) -#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ - (RV_X(x, 7, 2) << 6)) -#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ - (RV_X(x, 7, 3) << 6)) -#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) -#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) -#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) - -#define SHIFT_RIGHT(x, y) \ - ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) - -#define REG_MASK \ - ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) - -#define REG_OFFSET(insn, pos) \ - (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) - -#define REG_PTR(insn, pos, regs) \ - ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) - -#define GET_RM(insn) (((insn) >> 12) & 7) - -#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) -#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) -#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) -#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) -#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) -#define GET_SP(regs) (*REG_PTR(2, 0, regs)) -#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) -#define IMM_I(insn) ((s32)(insn) >> 20) -#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ - (s32)(((insn) >> 7) & 0x1f)) -#define MASK_FUNCT3 0x7000 - -static int truly_illegal_insn(struct kvm_vcpu *vcpu, - struct kvm_run *run, - ulong insn) -{ - struct kvm_cpu_trap utrap = { 0 }; - - /* Redirect trap to Guest VCPU */ - utrap.sepc = vcpu->arch.guest_context.sepc; - utrap.scause = EXC_INST_ILLEGAL; - utrap.stval = insn; - kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); - - return 1; -} - -static int system_opcode_insn(struct kvm_vcpu *vcpu, - struct kvm_run *run, - ulong insn) -{ - if ((insn & INSN_MASK_WFI) == INSN_MATCH_WFI) { - vcpu->stat.wfi_exit_stat++; - kvm_riscv_vcpu_wfi(vcpu); - vcpu->arch.guest_context.sepc += INSN_LEN(insn); - return 1; - } - - return truly_illegal_insn(vcpu, run, insn); -} - -static int virtual_inst_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, - struct kvm_cpu_trap *trap) -{ - unsigned long insn = trap->stval; - struct kvm_cpu_trap utrap = { 0 }; - struct kvm_cpu_context *ct; - - if (unlikely(INSN_IS_16BIT(insn))) { - if (insn == 0) { - ct = &vcpu->arch.guest_context; - insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, - ct->sepc, - &utrap); - if (utrap.scause) { - utrap.sepc = ct->sepc; - kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); - return 1; - } - } - if (INSN_IS_16BIT(insn)) - return truly_illegal_insn(vcpu, run, insn); - } - - switch ((insn & INSN_OPCODE_MASK) >> INSN_OPCODE_SHIFT) { - case INSN_OPCODE_SYSTEM: - return system_opcode_insn(vcpu, run, insn); - default: - return truly_illegal_insn(vcpu, run, insn); - } -} - -static int emulate_load(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long fault_addr, unsigned long htinst) -{ - u8 data_buf[8]; - unsigned long insn; - int shift = 0, len = 0, insn_len = 0; - struct 
kvm_cpu_trap utrap = { 0 }; - struct kvm_cpu_context *ct = &vcpu->arch.guest_context; - - /* Determine trapped instruction */ - if (htinst & 0x1) { - /* - * Bit[0] == 1 implies trapped instruction value is - * transformed instruction or custom instruction. - */ - insn = htinst | INSN_16BIT_MASK; - insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2; - } else { - /* - * Bit[0] == 0 implies trapped instruction value is - * zero or special value. - */ - insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, - &utrap); - if (utrap.scause) { - /* Redirect trap if we failed to read instruction */ - utrap.sepc = ct->sepc; - kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); - return 1; - } - insn_len = INSN_LEN(insn); - } - - /* Decode length of MMIO and shift */ - if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { - len = 4; - shift = 8 * (sizeof(ulong) - len); - } else if ((insn & INSN_MASK_LB) == INSN_MATCH_LB) { - len = 1; - shift = 8 * (sizeof(ulong) - len); - } else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) { - len = 1; - shift = 8 * (sizeof(ulong) - len); -#ifdef CONFIG_64BIT - } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) { - len = 8; - shift = 8 * (sizeof(ulong) - len); - } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) { - len = 4; -#endif - } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) { - len = 2; - shift = 8 * (sizeof(ulong) - len); - } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) { - len = 2; -#ifdef CONFIG_64BIT - } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) { - len = 8; - shift = 8 * (sizeof(ulong) - len); - insn = RVC_RS2S(insn) << SH_RD; - } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP && - ((insn >> SH_RD) & 0x1f)) { - len = 8; - shift = 8 * (sizeof(ulong) - len); -#endif - } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) { - len = 4; - shift = 8 * (sizeof(ulong) - len); - insn = RVC_RS2S(insn) << SH_RD; - } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP && - ((insn >> SH_RD) & 0x1f)) { - len = 4; - shift = 8 * (sizeof(ulong) - len); - } else { - return -EOPNOTSUPP; - } - - /* Fault address should be aligned to length of MMIO */ - if (fault_addr & (len - 1)) - return -EIO; - - /* Save instruction decode info */ - vcpu->arch.mmio_decode.insn = insn; - vcpu->arch.mmio_decode.insn_len = insn_len; - vcpu->arch.mmio_decode.shift = shift; - vcpu->arch.mmio_decode.len = len; - vcpu->arch.mmio_decode.return_handled = 0; - - /* Update MMIO details in kvm_run struct */ - run->mmio.is_write = false; - run->mmio.phys_addr = fault_addr; - run->mmio.len = len; - - /* Try to handle MMIO access in the kernel */ - if (!kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_addr, len, data_buf)) { - /* Successfully handled MMIO access in the kernel so resume */ - memcpy(run->mmio.data, data_buf, len); - vcpu->stat.mmio_exit_kernel++; - kvm_riscv_vcpu_mmio_return(vcpu, run); - return 1; - } - - /* Exit to userspace for MMIO emulation */ - vcpu->stat.mmio_exit_user++; - run->exit_reason = KVM_EXIT_MMIO; - - return 0; -} - -static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long fault_addr, unsigned long htinst) -{ - u8 data8; - u16 data16; - u32 data32; - u64 data64; - ulong data; - unsigned long insn; - int len = 0, insn_len = 0; - struct kvm_cpu_trap utrap = { 0 }; - struct kvm_cpu_context *ct = &vcpu->arch.guest_context; - - /* Determine trapped instruction */ - if (htinst & 0x1) { - /* - * Bit[0] == 1 implies trapped instruction value is - * transformed instruction or custom instruction. 
- */ - insn = htinst | INSN_16BIT_MASK; - insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2; - } else { - /* - * Bit[0] == 0 implies trapped instruction value is - * zero or special value. - */ - insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, - &utrap); - if (utrap.scause) { - /* Redirect trap if we failed to read instruction */ - utrap.sepc = ct->sepc; - kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); - return 1; - } - insn_len = INSN_LEN(insn); - } - - data = GET_RS2(insn, &vcpu->arch.guest_context); - data8 = data16 = data32 = data64 = data; - - if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) { - len = 4; - } else if ((insn & INSN_MASK_SB) == INSN_MATCH_SB) { - len = 1; -#ifdef CONFIG_64BIT - } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) { - len = 8; -#endif - } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) { - len = 2; -#ifdef CONFIG_64BIT - } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) { - len = 8; - data64 = GET_RS2S(insn, &vcpu->arch.guest_context); - } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP && - ((insn >> SH_RD) & 0x1f)) { - len = 8; - data64 = GET_RS2C(insn, &vcpu->arch.guest_context); -#endif - } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) { - len = 4; - data32 = GET_RS2S(insn, &vcpu->arch.guest_context); - } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP && - ((insn >> SH_RD) & 0x1f)) { - len = 4; - data32 = GET_RS2C(insn, &vcpu->arch.guest_context); - } else { - return -EOPNOTSUPP; - } - - /* Fault address should be aligned to length of MMIO */ - if (fault_addr & (len - 1)) - return -EIO; - - /* Save instruction decode info */ - vcpu->arch.mmio_decode.insn = insn; - vcpu->arch.mmio_decode.insn_len = insn_len; - vcpu->arch.mmio_decode.shift = 0; - vcpu->arch.mmio_decode.len = len; - vcpu->arch.mmio_decode.return_handled = 0; - - /* Copy data to kvm_run instance */ - switch (len) { - case 1: - *((u8 *)run->mmio.data) = data8; - break; - case 2: - *((u16 *)run->mmio.data) = data16; - break; - case 4: - *((u32 *)run->mmio.data) = data32; - break; - case 8: - *((u64 *)run->mmio.data) = data64; - break; - default: - return -EOPNOTSUPP; - } - - /* Update MMIO details in kvm_run struct */ - run->mmio.is_write = true; - run->mmio.phys_addr = fault_addr; - run->mmio.len = len; - - /* Try to handle MMIO access in the kernel */ - if (!kvm_io_bus_write(vcpu, KVM_MMIO_BUS, - fault_addr, len, run->mmio.data)) { - /* Successfully handled MMIO access in the kernel so resume */ - vcpu->stat.mmio_exit_kernel++; - kvm_riscv_vcpu_mmio_return(vcpu, run); - return 1; - } - - /* Exit to userspace for MMIO emulation */ - vcpu->stat.mmio_exit_user++; - run->exit_reason = KVM_EXIT_MMIO; - - return 0; -} - static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap) { struct kvm_memory_slot *memslot; unsigned long hva, fault_addr; - bool writeable; + bool writable; gfn_t gfn; int ret; fault_addr = (trap->htval << 2) | (trap->stval & 0x3); gfn = fault_addr >> PAGE_SHIFT; memslot = gfn_to_memslot(vcpu->kvm, gfn); - hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable); + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); if (kvm_is_error_hva(hva) || - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT && !writeable)) { + (trap->scause == EXC_STORE_GUEST_PAGE_FAULT && !writable)) { switch (trap->scause) { case EXC_LOAD_GUEST_PAGE_FAULT: - return emulate_load(vcpu, run, fault_addr, - trap->htinst); + return kvm_riscv_vcpu_mmio_load(vcpu, run, + fault_addr, + trap->htinst); case EXC_STORE_GUEST_PAGE_FAULT: - return 
emulate_store(vcpu, run, fault_addr, - trap->htinst); + return kvm_riscv_vcpu_mmio_store(vcpu, run, + fault_addr, + trap->htinst); default: return -EOPNOTSUPP; }; @@ -449,21 +48,6 @@ static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, } /** - * kvm_riscv_vcpu_wfi -- Emulate wait for interrupt (WFI) behaviour - * - * @vcpu: The VCPU pointer - */ -void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu) -{ - if (!kvm_arch_vcpu_runnable(vcpu)) { - kvm_vcpu_srcu_read_unlock(vcpu); - kvm_vcpu_halt(vcpu); - kvm_vcpu_srcu_read_lock(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); - } -} - -/** * kvm_riscv_vcpu_unpriv_read -- Read machine word from Guest memory * * @vcpu: The VCPU pointer @@ -601,66 +185,6 @@ void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC); } -/** - * kvm_riscv_vcpu_mmio_return -- Handle MMIO loads after user space emulation - * or in-kernel IO emulation - * - * @vcpu: The VCPU pointer - * @run: The VCPU run struct containing the mmio data - */ -int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - u8 data8; - u16 data16; - u32 data32; - u64 data64; - ulong insn; - int len, shift; - - if (vcpu->arch.mmio_decode.return_handled) - return 0; - - vcpu->arch.mmio_decode.return_handled = 1; - insn = vcpu->arch.mmio_decode.insn; - - if (run->mmio.is_write) - goto done; - - len = vcpu->arch.mmio_decode.len; - shift = vcpu->arch.mmio_decode.shift; - - switch (len) { - case 1: - data8 = *((u8 *)run->mmio.data); - SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data8 << shift >> shift); - break; - case 2: - data16 = *((u16 *)run->mmio.data); - SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data16 << shift >> shift); - break; - case 4: - data32 = *((u32 *)run->mmio.data); - SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data32 << shift >> shift); - break; - case 8: - data64 = *((u64 *)run->mmio.data); - SET_RD(insn, &vcpu->arch.guest_context, - (ulong)data64 << shift >> shift); - break; - default: - return -EOPNOTSUPP; - } - -done: - /* Move to next instruction */ - vcpu->arch.guest_context.sepc += vcpu->arch.mmio_decode.insn_len; - - return 0; -} - /* * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on * proper exit to userspace. 
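Most of the instruction-emulation code deleted from vcpu_exit.c here reappears, extended with CSR handling, in the new vcpu_insn.c further down in this diff. The user-visible addition is the KVM_EXIT_RISCV_CSR exit: when csr_insn() finds no in-kernel csr_func for a trapped CSR access it fills run->riscv_csr (csr_num, new_value, write_mask) and exits to userspace, and on the next KVM_RUN kvm_riscv_vcpu_csr_return() copies run->riscv_csr.ret_value into the guest's rd and steps past the instruction. The sketch below is illustrative only and not part of the patch; emulate_csr(), vcpu_fd and the surrounding run loop are hypothetical, and the write_mask/ret_value semantics reflect my reading of csr_insn() rather than documented uAPI.

/*
 * Illustrative sketch only (not part of the patch): servicing the new
 * KVM_EXIT_RISCV_CSR exit in a VMM run loop.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical VMM-side CSR model: returns the old value, applies the masked write. */
extern unsigned long emulate_csr(unsigned long csr_num, unsigned long new_value,
                                 unsigned long write_mask);

static int vcpu_run_once(int vcpu_fd, struct kvm_run *run)
{
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                return -1;

        switch (run->exit_reason) {
        case KVM_EXIT_RISCV_CSR:
                /*
                 * The value stored here is written to the guest's rd by
                 * kvm_riscv_vcpu_csr_return() on the next KVM_RUN.
                 */
                run->riscv_csr.ret_value = emulate_csr(run->riscv_csr.csr_num,
                                                       run->riscv_csr.new_value,
                                                       run->riscv_csr.write_mask);
                break;
        default:
                break;
        }
        return 0;
}

This mirrors the existing KVM_EXIT_MMIO handling, which kvm_arch_vcpu_ioctl_run() folds into a single switch over run->exit_reason earlier in this diff.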
@@ -680,7 +204,7 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, switch (trap->scause) { case EXC_VIRTUAL_INST_FAULT: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) - ret = virtual_inst_fault(vcpu, run, trap); + ret = kvm_riscv_vcpu_virtual_insn(vcpu, run, trap); break; case EXC_INST_GUEST_PAGE_FAULT: case EXC_LOAD_GUEST_PAGE_FAULT: diff --git a/arch/riscv/kvm/vcpu_fp.c b/arch/riscv/kvm/vcpu_fp.c index d4308c512007..9d8cbc42057a 100644 --- a/arch/riscv/kvm/vcpu_fp.c +++ b/arch/riscv/kvm/vcpu_fp.c @@ -16,12 +16,11 @@ #ifdef CONFIG_FPU void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu) { - unsigned long isa = vcpu->arch.isa; struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; cntx->sstatus &= ~SR_FS; - if (riscv_isa_extension_available(&isa, f) || - riscv_isa_extension_available(&isa, d)) + if (riscv_isa_extension_available(vcpu->arch.isa, f) || + riscv_isa_extension_available(vcpu->arch.isa, d)) cntx->sstatus |= SR_FS_INITIAL; else cntx->sstatus |= SR_FS_OFF; @@ -34,24 +33,24 @@ static void kvm_riscv_vcpu_fp_clean(struct kvm_cpu_context *cntx) } void kvm_riscv_vcpu_guest_fp_save(struct kvm_cpu_context *cntx, - unsigned long isa) + const unsigned long *isa) { if ((cntx->sstatus & SR_FS) == SR_FS_DIRTY) { - if (riscv_isa_extension_available(&isa, d)) + if (riscv_isa_extension_available(isa, d)) __kvm_riscv_fp_d_save(cntx); - else if (riscv_isa_extension_available(&isa, f)) + else if (riscv_isa_extension_available(isa, f)) __kvm_riscv_fp_f_save(cntx); kvm_riscv_vcpu_fp_clean(cntx); } } void kvm_riscv_vcpu_guest_fp_restore(struct kvm_cpu_context *cntx, - unsigned long isa) + const unsigned long *isa) { if ((cntx->sstatus & SR_FS) != SR_FS_OFF) { - if (riscv_isa_extension_available(&isa, d)) + if (riscv_isa_extension_available(isa, d)) __kvm_riscv_fp_d_restore(cntx); - else if (riscv_isa_extension_available(&isa, f)) + else if (riscv_isa_extension_available(isa, f)) __kvm_riscv_fp_f_restore(cntx); kvm_riscv_vcpu_fp_clean(cntx); } @@ -80,7 +79,6 @@ int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu, unsigned long rtype) { struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; - unsigned long isa = vcpu->arch.isa; unsigned long __user *uaddr = (unsigned long __user *)(unsigned long)reg->addr; unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | @@ -89,7 +87,7 @@ int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu, void *reg_val; if ((rtype == KVM_REG_RISCV_FP_F) && - riscv_isa_extension_available(&isa, f)) { + riscv_isa_extension_available(vcpu->arch.isa, f)) { if (KVM_REG_SIZE(reg->id) != sizeof(u32)) return -EINVAL; if (reg_num == KVM_REG_RISCV_FP_F_REG(fcsr)) @@ -100,7 +98,7 @@ int kvm_riscv_vcpu_get_reg_fp(struct kvm_vcpu *vcpu, else return -EINVAL; } else if ((rtype == KVM_REG_RISCV_FP_D) && - riscv_isa_extension_available(&isa, d)) { + riscv_isa_extension_available(vcpu->arch.isa, d)) { if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) { if (KVM_REG_SIZE(reg->id) != sizeof(u32)) return -EINVAL; @@ -126,7 +124,6 @@ int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu, unsigned long rtype) { struct kvm_cpu_context *cntx = &vcpu->arch.guest_context; - unsigned long isa = vcpu->arch.isa; unsigned long __user *uaddr = (unsigned long __user *)(unsigned long)reg->addr; unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | @@ -135,7 +132,7 @@ int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu, void *reg_val; if ((rtype == KVM_REG_RISCV_FP_F) && - riscv_isa_extension_available(&isa, f)) { + riscv_isa_extension_available(vcpu->arch.isa, f)) { if 
(KVM_REG_SIZE(reg->id) != sizeof(u32)) return -EINVAL; if (reg_num == KVM_REG_RISCV_FP_F_REG(fcsr)) @@ -146,7 +143,7 @@ int kvm_riscv_vcpu_set_reg_fp(struct kvm_vcpu *vcpu, else return -EINVAL; } else if ((rtype == KVM_REG_RISCV_FP_D) && - riscv_isa_extension_available(&isa, d)) { + riscv_isa_extension_available(vcpu->arch.isa, d)) { if (reg_num == KVM_REG_RISCV_FP_D_REG(fcsr)) { if (KVM_REG_SIZE(reg->id) != sizeof(u32)) return -EINVAL; diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c new file mode 100644 index 000000000000..7eb90a47b571 --- /dev/null +++ b/arch/riscv/kvm/vcpu_insn.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#include <linux/bitops.h> +#include <linux/kvm_host.h> + +#define INSN_OPCODE_MASK 0x007c +#define INSN_OPCODE_SHIFT 2 +#define INSN_OPCODE_SYSTEM 28 + +#define INSN_MASK_WFI 0xffffffff +#define INSN_MATCH_WFI 0x10500073 + +#define INSN_MATCH_CSRRW 0x1073 +#define INSN_MASK_CSRRW 0x707f +#define INSN_MATCH_CSRRS 0x2073 +#define INSN_MASK_CSRRS 0x707f +#define INSN_MATCH_CSRRC 0x3073 +#define INSN_MASK_CSRRC 0x707f +#define INSN_MATCH_CSRRWI 0x5073 +#define INSN_MASK_CSRRWI 0x707f +#define INSN_MATCH_CSRRSI 0x6073 +#define INSN_MASK_CSRRSI 0x707f +#define INSN_MATCH_CSRRCI 0x7073 +#define INSN_MASK_CSRRCI 0x707f + +#define INSN_MATCH_LB 0x3 +#define INSN_MASK_LB 0x707f +#define INSN_MATCH_LH 0x1003 +#define INSN_MASK_LH 0x707f +#define INSN_MATCH_LW 0x2003 +#define INSN_MASK_LW 0x707f +#define INSN_MATCH_LD 0x3003 +#define INSN_MASK_LD 0x707f +#define INSN_MATCH_LBU 0x4003 +#define INSN_MASK_LBU 0x707f +#define INSN_MATCH_LHU 0x5003 +#define INSN_MASK_LHU 0x707f +#define INSN_MATCH_LWU 0x6003 +#define INSN_MASK_LWU 0x707f +#define INSN_MATCH_SB 0x23 +#define INSN_MASK_SB 0x707f +#define INSN_MATCH_SH 0x1023 +#define INSN_MASK_SH 0x707f +#define INSN_MATCH_SW 0x2023 +#define INSN_MASK_SW 0x707f +#define INSN_MATCH_SD 0x3023 +#define INSN_MASK_SD 0x707f + +#define INSN_MATCH_C_LD 0x6000 +#define INSN_MASK_C_LD 0xe003 +#define INSN_MATCH_C_SD 0xe000 +#define INSN_MASK_C_SD 0xe003 +#define INSN_MATCH_C_LW 0x4000 +#define INSN_MASK_C_LW 0xe003 +#define INSN_MATCH_C_SW 0xc000 +#define INSN_MASK_C_SW 0xe003 +#define INSN_MATCH_C_LDSP 0x6002 +#define INSN_MASK_C_LDSP 0xe003 +#define INSN_MATCH_C_SDSP 0xe002 +#define INSN_MASK_C_SDSP 0xe003 +#define INSN_MATCH_C_LWSP 0x4002 +#define INSN_MASK_C_LWSP 0xe003 +#define INSN_MATCH_C_SWSP 0xc002 +#define INSN_MASK_C_SWSP 0xe003 + +#define INSN_16BIT_MASK 0x3 + +#define INSN_IS_16BIT(insn) (((insn) & INSN_16BIT_MASK) != INSN_16BIT_MASK) + +#define INSN_LEN(insn) (INSN_IS_16BIT(insn) ? 
2 : 4) + +#ifdef CONFIG_64BIT +#define LOG_REGBYTES 3 +#else +#define LOG_REGBYTES 2 +#endif +#define REGBYTES (1 << LOG_REGBYTES) + +#define SH_RD 7 +#define SH_RS1 15 +#define SH_RS2 20 +#define SH_RS2C 2 +#define MASK_RX 0x1f + +#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) +#define RVC_LW_IMM(x) ((RV_X(x, 6, 1) << 2) | \ + (RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 1) << 6)) +#define RVC_LD_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 5, 2) << 6)) +#define RVC_LWSP_IMM(x) ((RV_X(x, 4, 3) << 2) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 2) << 6)) +#define RVC_LDSP_IMM(x) ((RV_X(x, 5, 2) << 3) | \ + (RV_X(x, 12, 1) << 5) | \ + (RV_X(x, 2, 3) << 6)) +#define RVC_SWSP_IMM(x) ((RV_X(x, 9, 4) << 2) | \ + (RV_X(x, 7, 2) << 6)) +#define RVC_SDSP_IMM(x) ((RV_X(x, 10, 3) << 3) | \ + (RV_X(x, 7, 3) << 6)) +#define RVC_RS1S(insn) (8 + RV_X(insn, SH_RD, 3)) +#define RVC_RS2S(insn) (8 + RV_X(insn, SH_RS2C, 3)) +#define RVC_RS2(insn) RV_X(insn, SH_RS2C, 5) + +#define SHIFT_RIGHT(x, y) \ + ((y) < 0 ? ((x) << -(y)) : ((x) >> (y))) + +#define REG_MASK \ + ((1 << (5 + LOG_REGBYTES)) - (1 << LOG_REGBYTES)) + +#define REG_OFFSET(insn, pos) \ + (SHIFT_RIGHT((insn), (pos) - LOG_REGBYTES) & REG_MASK) + +#define REG_PTR(insn, pos, regs) \ + ((ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))) + +#define GET_FUNCT3(insn) (((insn) >> 12) & 7) + +#define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) +#define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) +#define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) +#define GET_RS2S(insn, regs) (*REG_PTR(RVC_RS2S(insn), 0, regs)) +#define GET_RS2C(insn, regs) (*REG_PTR(insn, SH_RS2C, regs)) +#define GET_SP(regs) (*REG_PTR(2, 0, regs)) +#define SET_RD(insn, regs, val) (*REG_PTR(insn, SH_RD, regs) = (val)) +#define IMM_I(insn) ((s32)(insn) >> 20) +#define IMM_S(insn) (((s32)(insn) >> 25 << 5) | \ + (s32)(((insn) >> 7) & 0x1f)) + +struct insn_func { + unsigned long mask; + unsigned long match; + /* + * Possible return values are as follows: + * 1) Returns < 0 for error case + * 2) Returns 0 for exit to user-space + * 3) Returns 1 to continue with next sepc + * 4) Returns 2 to continue with same sepc + * 5) Returns 3 to inject illegal instruction trap and continue + * 6) Returns 4 to inject virtual instruction trap and continue + * + * Use enum kvm_insn_return for return values + */ + int (*func)(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn); +}; + +static int truly_illegal_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, + ulong insn) +{ + struct kvm_cpu_trap utrap = { 0 }; + + /* Redirect trap to Guest VCPU */ + utrap.sepc = vcpu->arch.guest_context.sepc; + utrap.scause = EXC_INST_ILLEGAL; + utrap.stval = insn; + utrap.htval = 0; + utrap.htinst = 0; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + + return 1; +} + +static int truly_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, + ulong insn) +{ + struct kvm_cpu_trap utrap = { 0 }; + + /* Redirect trap to Guest VCPU */ + utrap.sepc = vcpu->arch.guest_context.sepc; + utrap.scause = EXC_VIRTUAL_INST_FAULT; + utrap.stval = insn; + utrap.htval = 0; + utrap.htinst = 0; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + + return 1; +} + +/** + * kvm_riscv_vcpu_wfi -- Emulate wait for interrupt (WFI) behaviour + * + * @vcpu: The VCPU pointer + */ +void kvm_riscv_vcpu_wfi(struct kvm_vcpu *vcpu) +{ + if (!kvm_arch_vcpu_runnable(vcpu)) { + kvm_vcpu_srcu_read_unlock(vcpu); + kvm_vcpu_halt(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); + } +} + +static int 
wfi_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn) +{ + vcpu->stat.wfi_exit_stat++; + kvm_riscv_vcpu_wfi(vcpu); + return KVM_INSN_CONTINUE_NEXT_SEPC; +} + +struct csr_func { + unsigned int base; + unsigned int count; + /* + * Possible return values are as same as "func" callback in + * "struct insn_func". + */ + int (*func)(struct kvm_vcpu *vcpu, unsigned int csr_num, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask); +}; + +static const struct csr_func csr_funcs[] = { }; + +/** + * kvm_riscv_vcpu_csr_return -- Handle CSR read/write after user space + * emulation or in-kernel emulation + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the CSR data + * + * Returns > 0 upon failure and 0 upon success + */ +int kvm_riscv_vcpu_csr_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + ulong insn; + + if (vcpu->arch.csr_decode.return_handled) + return 0; + vcpu->arch.csr_decode.return_handled = 1; + + /* Update destination register for CSR reads */ + insn = vcpu->arch.csr_decode.insn; + if ((insn >> SH_RD) & MASK_RX) + SET_RD(insn, &vcpu->arch.guest_context, + run->riscv_csr.ret_value); + + /* Move to next instruction */ + vcpu->arch.guest_context.sepc += INSN_LEN(insn); + + return 0; +} + +static int csr_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, ulong insn) +{ + int i, rc = KVM_INSN_ILLEGAL_TRAP; + unsigned int csr_num = insn >> SH_RS2; + unsigned int rs1_num = (insn >> SH_RS1) & MASK_RX; + ulong rs1_val = GET_RS1(insn, &vcpu->arch.guest_context); + const struct csr_func *tcfn, *cfn = NULL; + ulong val = 0, wr_mask = 0, new_val = 0; + + /* Decode the CSR instruction */ + switch (GET_FUNCT3(insn)) { + case GET_FUNCT3(INSN_MATCH_CSRRW): + wr_mask = -1UL; + new_val = rs1_val; + break; + case GET_FUNCT3(INSN_MATCH_CSRRS): + wr_mask = rs1_val; + new_val = -1UL; + break; + case GET_FUNCT3(INSN_MATCH_CSRRC): + wr_mask = rs1_val; + new_val = 0; + break; + case GET_FUNCT3(INSN_MATCH_CSRRWI): + wr_mask = -1UL; + new_val = rs1_num; + break; + case GET_FUNCT3(INSN_MATCH_CSRRSI): + wr_mask = rs1_num; + new_val = -1UL; + break; + case GET_FUNCT3(INSN_MATCH_CSRRCI): + wr_mask = rs1_num; + new_val = 0; + break; + default: + return rc; + } + + /* Save instruction decode info */ + vcpu->arch.csr_decode.insn = insn; + vcpu->arch.csr_decode.return_handled = 0; + + /* Update CSR details in kvm_run struct */ + run->riscv_csr.csr_num = csr_num; + run->riscv_csr.new_value = new_val; + run->riscv_csr.write_mask = wr_mask; + run->riscv_csr.ret_value = 0; + + /* Find in-kernel CSR function */ + for (i = 0; i < ARRAY_SIZE(csr_funcs); i++) { + tcfn = &csr_funcs[i]; + if ((tcfn->base <= csr_num) && + (csr_num < (tcfn->base + tcfn->count))) { + cfn = tcfn; + break; + } + } + + /* First try in-kernel CSR emulation */ + if (cfn && cfn->func) { + rc = cfn->func(vcpu, csr_num, &val, new_val, wr_mask); + if (rc > KVM_INSN_EXIT_TO_USER_SPACE) { + if (rc == KVM_INSN_CONTINUE_NEXT_SEPC) { + run->riscv_csr.ret_value = val; + vcpu->stat.csr_exit_kernel++; + kvm_riscv_vcpu_csr_return(vcpu, run); + rc = KVM_INSN_CONTINUE_SAME_SEPC; + } + return rc; + } + } + + /* Exit to user-space for CSR emulation */ + if (rc <= KVM_INSN_EXIT_TO_USER_SPACE) { + vcpu->stat.csr_exit_user++; + run->exit_reason = KVM_EXIT_RISCV_CSR; + } + + return rc; +} + +static const struct insn_func system_opcode_funcs[] = { + { + .mask = INSN_MASK_CSRRW, + .match = INSN_MATCH_CSRRW, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRS, + .match = INSN_MATCH_CSRRS, + .func = csr_insn, + }, + { + .mask 
= INSN_MASK_CSRRC, + .match = INSN_MATCH_CSRRC, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRWI, + .match = INSN_MATCH_CSRRWI, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRSI, + .match = INSN_MATCH_CSRRSI, + .func = csr_insn, + }, + { + .mask = INSN_MASK_CSRRCI, + .match = INSN_MATCH_CSRRCI, + .func = csr_insn, + }, + { + .mask = INSN_MASK_WFI, + .match = INSN_MATCH_WFI, + .func = wfi_insn, + }, +}; + +static int system_opcode_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, + ulong insn) +{ + int i, rc = KVM_INSN_ILLEGAL_TRAP; + const struct insn_func *ifn; + + for (i = 0; i < ARRAY_SIZE(system_opcode_funcs); i++) { + ifn = &system_opcode_funcs[i]; + if ((insn & ifn->mask) == ifn->match) { + rc = ifn->func(vcpu, run, insn); + break; + } + } + + switch (rc) { + case KVM_INSN_ILLEGAL_TRAP: + return truly_illegal_insn(vcpu, run, insn); + case KVM_INSN_VIRTUAL_TRAP: + return truly_virtual_insn(vcpu, run, insn); + case KVM_INSN_CONTINUE_NEXT_SEPC: + vcpu->arch.guest_context.sepc += INSN_LEN(insn); + break; + default: + break; + } + + return (rc <= 0) ? rc : 1; +} + +/** + * kvm_riscv_vcpu_virtual_insn -- Handle virtual instruction trap + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the mmio data + * @trap: Trap details + * + * Returns > 0 to continue run-loop + * Returns 0 to exit run-loop and handle in user-space. + * Returns < 0 to report failure and exit run-loop + */ +int kvm_riscv_vcpu_virtual_insn(struct kvm_vcpu *vcpu, struct kvm_run *run, + struct kvm_cpu_trap *trap) +{ + unsigned long insn = trap->stval; + struct kvm_cpu_trap utrap = { 0 }; + struct kvm_cpu_context *ct; + + if (unlikely(INSN_IS_16BIT(insn))) { + if (insn == 0) { + ct = &vcpu->arch.guest_context; + insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, + ct->sepc, + &utrap); + if (utrap.scause) { + utrap.sepc = ct->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + return 1; + } + } + if (INSN_IS_16BIT(insn)) + return truly_illegal_insn(vcpu, run, insn); + } + + switch ((insn & INSN_OPCODE_MASK) >> INSN_OPCODE_SHIFT) { + case INSN_OPCODE_SYSTEM: + return system_opcode_insn(vcpu, run, insn); + default: + return truly_illegal_insn(vcpu, run, insn); + } +} + +/** + * kvm_riscv_vcpu_mmio_load -- Emulate MMIO load instruction + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the mmio data + * @fault_addr: Guest physical address to load + * @htinst: Transformed encoding of the load instruction + * + * Returns > 0 to continue run-loop + * Returns 0 to exit run-loop and handle in user-space. + * Returns < 0 to report failure and exit run-loop + */ +int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long fault_addr, + unsigned long htinst) +{ + u8 data_buf[8]; + unsigned long insn; + int shift = 0, len = 0, insn_len = 0; + struct kvm_cpu_trap utrap = { 0 }; + struct kvm_cpu_context *ct = &vcpu->arch.guest_context; + + /* Determine trapped instruction */ + if (htinst & 0x1) { + /* + * Bit[0] == 1 implies trapped instruction value is + * transformed instruction or custom instruction. + */ + insn = htinst | INSN_16BIT_MASK; + insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2; + } else { + /* + * Bit[0] == 0 implies trapped instruction value is + * zero or special value. 
+ */ + insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, + &utrap); + if (utrap.scause) { + /* Redirect trap if we failed to read instruction */ + utrap.sepc = ct->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + return 1; + } + insn_len = INSN_LEN(insn); + } + + /* Decode length of MMIO and shift */ + if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) { + len = 4; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LB) == INSN_MATCH_LB) { + len = 1; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) { + len = 1; + shift = 8 * (sizeof(ulong) - len); +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) { + len = 8; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LWU) == INSN_MATCH_LWU) { + len = 4; +#endif + } else if ((insn & INSN_MASK_LH) == INSN_MATCH_LH) { + len = 2; + shift = 8 * (sizeof(ulong) - len); + } else if ((insn & INSN_MASK_LHU) == INSN_MATCH_LHU) { + len = 2; +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_C_LD) == INSN_MATCH_C_LD) { + len = 8; + shift = 8 * (sizeof(ulong) - len); + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LDSP) == INSN_MATCH_C_LDSP && + ((insn >> SH_RD) & 0x1f)) { + len = 8; + shift = 8 * (sizeof(ulong) - len); +#endif + } else if ((insn & INSN_MASK_C_LW) == INSN_MATCH_C_LW) { + len = 4; + shift = 8 * (sizeof(ulong) - len); + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LWSP) == INSN_MATCH_C_LWSP && + ((insn >> SH_RD) & 0x1f)) { + len = 4; + shift = 8 * (sizeof(ulong) - len); + } else { + return -EOPNOTSUPP; + } + + /* Fault address should be aligned to length of MMIO */ + if (fault_addr & (len - 1)) + return -EIO; + + /* Save instruction decode info */ + vcpu->arch.mmio_decode.insn = insn; + vcpu->arch.mmio_decode.insn_len = insn_len; + vcpu->arch.mmio_decode.shift = shift; + vcpu->arch.mmio_decode.len = len; + vcpu->arch.mmio_decode.return_handled = 0; + + /* Update MMIO details in kvm_run struct */ + run->mmio.is_write = false; + run->mmio.phys_addr = fault_addr; + run->mmio.len = len; + + /* Try to handle MMIO access in the kernel */ + if (!kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_addr, len, data_buf)) { + /* Successfully handled MMIO access in the kernel so resume */ + memcpy(run->mmio.data, data_buf, len); + vcpu->stat.mmio_exit_kernel++; + kvm_riscv_vcpu_mmio_return(vcpu, run); + return 1; + } + + /* Exit to userspace for MMIO emulation */ + vcpu->stat.mmio_exit_user++; + run->exit_reason = KVM_EXIT_MMIO; + + return 0; +} + +/** + * kvm_riscv_vcpu_mmio_store -- Emulate MMIO store instruction + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the mmio data + * @fault_addr: Guest physical address to store + * @htinst: Transformed encoding of the store instruction + * + * Returns > 0 to continue run-loop + * Returns 0 to exit run-loop and handle in user-space. + * Returns < 0 to report failure and exit run-loop + */ +int kvm_riscv_vcpu_mmio_store(struct kvm_vcpu *vcpu, struct kvm_run *run, + unsigned long fault_addr, + unsigned long htinst) +{ + u8 data8; + u16 data16; + u32 data32; + u64 data64; + ulong data; + unsigned long insn; + int len = 0, insn_len = 0; + struct kvm_cpu_trap utrap = { 0 }; + struct kvm_cpu_context *ct = &vcpu->arch.guest_context; + + /* Determine trapped instruction */ + if (htinst & 0x1) { + /* + * Bit[0] == 1 implies trapped instruction value is + * transformed instruction or custom instruction. 
+ */ + insn = htinst | INSN_16BIT_MASK; + insn_len = (htinst & BIT(1)) ? INSN_LEN(insn) : 2; + } else { + /* + * Bit[0] == 0 implies trapped instruction value is + * zero or special value. + */ + insn = kvm_riscv_vcpu_unpriv_read(vcpu, true, ct->sepc, + &utrap); + if (utrap.scause) { + /* Redirect trap if we failed to read instruction */ + utrap.sepc = ct->sepc; + kvm_riscv_vcpu_trap_redirect(vcpu, &utrap); + return 1; + } + insn_len = INSN_LEN(insn); + } + + data = GET_RS2(insn, &vcpu->arch.guest_context); + data8 = data16 = data32 = data64 = data; + + if ((insn & INSN_MASK_SW) == INSN_MATCH_SW) { + len = 4; + } else if ((insn & INSN_MASK_SB) == INSN_MATCH_SB) { + len = 1; +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_SD) == INSN_MATCH_SD) { + len = 8; +#endif + } else if ((insn & INSN_MASK_SH) == INSN_MATCH_SH) { + len = 2; +#ifdef CONFIG_64BIT + } else if ((insn & INSN_MASK_C_SD) == INSN_MATCH_C_SD) { + len = 8; + data64 = GET_RS2S(insn, &vcpu->arch.guest_context); + } else if ((insn & INSN_MASK_C_SDSP) == INSN_MATCH_C_SDSP && + ((insn >> SH_RD) & 0x1f)) { + len = 8; + data64 = GET_RS2C(insn, &vcpu->arch.guest_context); +#endif + } else if ((insn & INSN_MASK_C_SW) == INSN_MATCH_C_SW) { + len = 4; + data32 = GET_RS2S(insn, &vcpu->arch.guest_context); + } else if ((insn & INSN_MASK_C_SWSP) == INSN_MATCH_C_SWSP && + ((insn >> SH_RD) & 0x1f)) { + len = 4; + data32 = GET_RS2C(insn, &vcpu->arch.guest_context); + } else { + return -EOPNOTSUPP; + } + + /* Fault address should be aligned to length of MMIO */ + if (fault_addr & (len - 1)) + return -EIO; + + /* Save instruction decode info */ + vcpu->arch.mmio_decode.insn = insn; + vcpu->arch.mmio_decode.insn_len = insn_len; + vcpu->arch.mmio_decode.shift = 0; + vcpu->arch.mmio_decode.len = len; + vcpu->arch.mmio_decode.return_handled = 0; + + /* Copy data to kvm_run instance */ + switch (len) { + case 1: + *((u8 *)run->mmio.data) = data8; + break; + case 2: + *((u16 *)run->mmio.data) = data16; + break; + case 4: + *((u32 *)run->mmio.data) = data32; + break; + case 8: + *((u64 *)run->mmio.data) = data64; + break; + default: + return -EOPNOTSUPP; + } + + /* Update MMIO details in kvm_run struct */ + run->mmio.is_write = true; + run->mmio.phys_addr = fault_addr; + run->mmio.len = len; + + /* Try to handle MMIO access in the kernel */ + if (!kvm_io_bus_write(vcpu, KVM_MMIO_BUS, + fault_addr, len, run->mmio.data)) { + /* Successfully handled MMIO access in the kernel so resume */ + vcpu->stat.mmio_exit_kernel++; + kvm_riscv_vcpu_mmio_return(vcpu, run); + return 1; + } + + /* Exit to userspace for MMIO emulation */ + vcpu->stat.mmio_exit_user++; + run->exit_reason = KVM_EXIT_MMIO; + + return 0; +} + +/** + * kvm_riscv_vcpu_mmio_return -- Handle MMIO loads after user space emulation + * or in-kernel IO emulation + * + * @vcpu: The VCPU pointer + * @run: The VCPU run struct containing the mmio data + */ +int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + u8 data8; + u16 data16; + u32 data32; + u64 data64; + ulong insn; + int len, shift; + + if (vcpu->arch.mmio_decode.return_handled) + return 0; + + vcpu->arch.mmio_decode.return_handled = 1; + insn = vcpu->arch.mmio_decode.insn; + + if (run->mmio.is_write) + goto done; + + len = vcpu->arch.mmio_decode.len; + shift = vcpu->arch.mmio_decode.shift; + + switch (len) { + case 1: + data8 = *((u8 *)run->mmio.data); + SET_RD(insn, &vcpu->arch.guest_context, + (ulong)data8 << shift >> shift); + break; + case 2: + data16 = *((u16 *)run->mmio.data); + SET_RD(insn, 
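
As a companion to the MMIO load/store emulation above, a hedged userspace sketch of servicing KVM_EXIT_MMIO. vmm_device_rw() is a hypothetical device-model hook; everything else is the standard kvm_run MMIO layout these handlers fill in.

#include <linux/kvm.h>

/* Hypothetical device model: read or write `len` bytes at guest physical `gpa`. */
extern void vmm_device_rw(__u64 gpa, void *data, __u32 len, int is_write);

static void handle_mmio_exit(struct kvm_run *run)
{
    if (run->exit_reason != KVM_EXIT_MMIO)
        return;

    /* For stores, the kernel already copied the source register into
     * run->mmio.data; for loads, whatever we leave in run->mmio.data is
     * written back to rd by kvm_riscv_vcpu_mmio_return() on the next
     * KVM_RUN. */
    vmm_device_rw(run->mmio.phys_addr, run->mmio.data,
                  run->mmio.len, run->mmio.is_write);
}
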
&vcpu->arch.guest_context, + (ulong)data16 << shift >> shift); + break; + case 4: + data32 = *((u32 *)run->mmio.data); + SET_RD(insn, &vcpu->arch.guest_context, + (ulong)data32 << shift >> shift); + break; + case 8: + data64 = *((u64 *)run->mmio.data); + SET_RD(insn, &vcpu->arch.guest_context, + (ulong)data64 << shift >> shift); + break; + default: + return -EOPNOTSUPP; + } + +done: + /* Move to next instruction */ + vcpu->arch.guest_context.sepc += vcpu->arch.mmio_decode.insn_len; + + return 0; +} diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index 5c4c37ff2d48..595043857049 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -214,12 +214,10 @@ void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) #endif } -int kvm_riscv_guest_timer_init(struct kvm *kvm) +void kvm_riscv_guest_timer_init(struct kvm *kvm) { struct kvm_guest_timer *gt = &kvm->arch.timer; riscv_cs_get_mult_shift(>->nsec_mult, >->nsec_shift); gt->time_delta = -get_cycles64(); - - return 0; } diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 945a2bf5e3f6..65a964d7e70d 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -41,7 +41,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return r; } - return kvm_riscv_guest_timer_init(kvm); + kvm_riscv_guest_timer_init(kvm); + + return 0; } void kvm_arch_destroy_vm(struct kvm *kvm) diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index e6be155ab2e5..a5fa667160b2 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -41,6 +41,12 @@ void uv_query_info(void) uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id; uv_info.uv_feature_indications = uvcb.uv_feature_indications; + uv_info.supp_se_hdr_ver = uvcb.supp_se_hdr_versions; + uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf; + uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len; + uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len; + uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver; + uv_info.supp_att_pflags = uvcb.supp_att_pflags; } #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 01936fdfaddb..e82e5626e139 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -12,10 +12,11 @@ #include <linux/bit_spinlock.h> #include <linux/dma-mapping.h> +#include <asm/tpi.h> struct airq_struct { struct hlist_node list; /* Handler queueing. 
*/ - void (*handler)(struct airq_struct *airq, bool floating); + void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ @@ -46,8 +47,10 @@ struct airq_iv { #define AIRQ_IV_PTR 4 /* Allocate the ptr array */ #define AIRQ_IV_DATA 8 /* Allocate the data array */ #define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ +#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */ -struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); +struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags, + unsigned long *vec); void airq_iv_release(struct airq_iv *iv); unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 40264f60b0da..5cc46e0dde62 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -147,5 +147,42 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start, void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); int gmap_mark_unmergeable(void); -void s390_reset_acc(struct mm_struct *mm); +void s390_unlist_old_asce(struct gmap *gmap); +int s390_replace_asce(struct gmap *gmap); +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible); + +/** + * s390_uv_destroy_range - Destroy a range of pages in the given mm. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls, but + * it will otherwise only return when it completed. + */ +static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + (void)__s390_uv_destroy_range(mm, start, end, false); +} + +/** + * s390_uv_destroy_range_interruptible - Destroy a range of pages in the + * given mm, but stop when a fatal signal is received. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls. If + * a fatal signal is received, it will return with -EINTR immediately, + * without finishing destroying the whole range. Upon successful + * completion, 0 is returned. 
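
To illustrate the changed adapter-interrupt callback in asm/airq.h above, a hedged sketch of a handler using the new signature. The cast mirrors what gib_alert_irq_handler() does later in this merge; the ISC value is arbitrary.

#include <linux/printk.h>
#include <asm/airq.h>
#include <asm/tpi.h>

static void example_airq_handler(struct airq_struct *airq,
                                 struct tpi_info *tpi_info)
{
    /* The TPI info replaces the old "bool floating" argument and lets the
     * handler inspect the adapter interruption code directly. */
    struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info;

    if (info->forward || info->error)
        pr_debug("adapter event on ISC %u\n", info->isc);
}

static struct airq_struct example_airq = {
    .handler = example_airq_handler,
    .isc     = 3,  /* example ISC; hooked up via register_adapter_interrupt() */
};
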
+ */ +static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return __s390_uv_destroy_range(mm, start, end, true); +} #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 766028d54a3e..f39092e0ceaa 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -19,6 +19,8 @@ #include <linux/kvm.h> #include <linux/seqlock.h> #include <linux/module.h> +#include <linux/pci.h> +#include <linux/mmu_notifier.h> #include <asm/debug.h> #include <asm/cpu.h> #include <asm/fpu/api.h> @@ -93,19 +95,30 @@ union ipte_control { }; }; +union sca_utility { + __u16 val; + struct { + __u16 mtcr : 1; + __u16 reserved : 15; + }; +}; + struct bsca_block { union ipte_control ipte_control; __u64 reserved[5]; __u64 mcn; - __u64 reserved2; + union sca_utility utility; + __u8 reserved2[6]; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; }; struct esca_block { union ipte_control ipte_control; - __u64 reserved1[7]; + __u64 reserved1[6]; + union sca_utility utility; + __u8 reserved2[6]; __u64 mcn[4]; - __u64 reserved2[20]; + __u64 reserved3[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; }; @@ -249,12 +262,16 @@ struct kvm_s390_sie_block { #define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 __u8 ecb; /* 0x0061 */ #define ECB2_CMMA 0x80 #define ECB2_IEP 0x20 #define ECB2_PFMFI 0x08 #define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 #define ECB3_DEA 0x08 #define ECB3_AES 0x04 #define ECB3_RI 0x01 @@ -759,6 +776,7 @@ struct kvm_vm_stat { u64 inject_pfault_done; u64 inject_service_signal; u64 inject_virtio; + u64 aen_forward; }; struct kvm_arch_memory_slot { @@ -923,6 +941,8 @@ struct kvm_s390_pv { u64 guest_len; unsigned long stor_base; void *stor_var; + bool dumping; + struct mmu_notifier mmu_notifier; }; struct kvm_arch{ @@ -939,6 +959,7 @@ struct kvm_arch{ int use_cmma; int use_pfmfi; int use_skf; + int use_zpci_interp; int user_cpu_state_ctrl; int user_sigp; int user_stsi; @@ -962,6 +983,8 @@ struct kvm_arch{ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; + struct list_head kzdev_list; + spinlock_t kzdev_list_lock; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -1012,4 +1035,19 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +#define __KVM_HAVE_ARCH_VM_FREE +void kvm_arch_free_vm(struct kvm *kvm); + +#ifdef CONFIG_VFIO_PCI_ZDEV_KVM +int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm); +void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev); +#else +static inline int kvm_s390_pci_register_kvm(struct zpci_dev *dev, + struct kvm *kvm) +{ + return -EPERM; +} +static inline void kvm_s390_pci_unregister_kvm(struct zpci_dev *dev) {} +#endif + #endif diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 82aae78e1315..1572b3634cdd 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -18,7 +18,7 @@ typedef struct { unsigned long asce_limit; unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ - atomic_t is_protected; + atomic_t protected_count; /* * The following bitfields need a down_write on the mm * semaphore when they are written to. 
As they are only diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c7937f369e62..2a38af5a00c2 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -26,7 +26,7 @@ static inline int init_new_context(struct task_struct *tsk, INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); - atomic_set(&mm->context.is_protected, 0); + atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; #ifdef CONFIG_PGSTE diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index fdb9745ee998..85eb0ef9d4c3 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -9,6 +9,7 @@ #include <asm-generic/pci.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> +#include <asm/pci_insn.h> #include <asm/sclp.h> #define PCIBIOS_MIN_IO 0x1000 @@ -97,6 +98,7 @@ struct zpci_bar_struct { }; struct s390_domain; +struct kvm_zdev; #define ZPCI_FUNCTIONS_PER_BUS 256 struct zpci_bus { @@ -123,11 +125,14 @@ struct zpci_dev { enum zpci_state state; u32 fid; /* function ID, used by sclp */ u32 fh; /* function handle, used by insn's */ + u32 gisa; /* GISA designation for passthrough */ u16 vfn; /* virtual function number */ u16 pchid; /* physical channel ID */ + u16 maxstbl; /* Maximum store block size */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ u8 port; + u8 dtsm; /* Supported DT mask */ u8 rid_available : 1; u8 has_hp_slot : 1; u8 has_resources : 1; @@ -186,7 +191,10 @@ struct zpci_dev { struct dentry *debugfs_dev; + /* IOMMU and passthrough */ struct s390_domain *s390_domain; /* s390 IOMMU domain data */ + struct kvm_zdev *kzdev; + struct mutex kzdev_lock; }; static inline bool zdev_enabled(struct zpci_dev *zdev) @@ -198,6 +206,9 @@ extern const struct attribute_group *zpci_attr_groups[]; extern unsigned int s390_pci_force_floating __initdata; extern unsigned int s390_pci_no_rid; +extern union zpci_sic_iib *zpci_aipb; +extern struct airq_iv *zpci_aif_sbv; + /* ----------------------------------------------------------------------------- Prototypes ----------------------------------------------------------------------------- */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 1f4b666e85ee..d6189ed14f84 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -153,9 +153,11 @@ struct clp_rsp_query_pci_grp { u8 : 6; u8 frame : 1; u8 refresh : 1; /* TLB refresh mode */ - u16 reserved2; + u16 : 3; + u16 maxstbl : 13; /* Maximum store block size */ u16 mui; - u16 : 16; + u8 dtsm; /* Supported DT mask */ + u8 reserved3; u16 maxfaal; u16 : 4; u16 dnoi : 12; @@ -173,7 +175,8 @@ struct clp_req_set_pci { u16 reserved2; u8 oc; /* operation controls */ u8 ndas; /* number of dma spaces */ - u64 reserved3; + u32 reserved3; + u32 gisa; /* GISA designation */ } __packed; /* Set PCI function response */ diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 61cf9531f68f..e5f57cfe1d45 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -98,6 +98,15 @@ struct zpci_fib { u32 gd; } __packed __aligned(8); +/* Set Interruption Controls Operation Controls */ +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_SET_AENI_CONTROLS 2 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 
18 + /* directed interruption information block */ struct zpci_diib { u32 : 1; @@ -119,9 +128,20 @@ struct zpci_cdiib { u64 : 64; } __packed __aligned(8); +/* adapter interruption parameters block */ +struct zpci_aipb { + u64 faisb; + u64 gait; + u16 : 13; + u16 afi : 3; + u32 : 32; + u16 faal; +} __packed __aligned(8); + union zpci_sic_iib { struct zpci_diib diib; struct zpci_cdiib cdiib; + struct zpci_aipb aipb; }; DECLARE_STATIC_KEY_FALSE(have_mio); @@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset); int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); int __zpci_store_block(const u64 *data, u64 req, u64 offset); void zpci_barrier(void); -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); - -static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) -{ - union zpci_sic_iib iib = {{0}}; - - return __zpci_set_irq_ctrl(ctl, isc, &iib); -} +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); #endif diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a397b072a580..cf81acf3879c 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -525,7 +525,7 @@ static inline int mm_has_pgste(struct mm_struct *mm) static inline int mm_is_protected(struct mm_struct *mm) { #ifdef CONFIG_PGSTE - if (unlikely(atomic_read(&mm->context.is_protected))) + if (unlikely(atomic_read(&mm->context.protected_count))) return 1; #endif return 0; @@ -1182,9 +1182,22 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } else { res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } - /* At this point the reference through the mapping is still present */ - if (mm_is_protected(mm) && pte_present(res)) - uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + /* Nothing to do */ + if (!mm_is_protected(mm) || !pte_present(res)) + return res; + /* + * At this point the reference through the mapping is still present. + * The notifier should have destroyed all protected vCPUs at this + * point, so the destroy should be successful. + */ + if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK)) + return res; + /* + * If something went wrong and the page could not be destroyed, or + * if this is not a mm teardown, the slower export is used as + * fallback instead. 
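
With the static inline wrapper around __zpci_set_irq_ctrl() dropped above, callers now supply the interruption information block themselves. A hedged sketch of the updated calling convention, the same pattern aen_process_gait() uses further down:

#include <asm/pci_insn.h>

static void reenable_adapter_irqs(u8 isc)
{
    /* A zeroed iib suffices when no AEN or directed-IRQ data is passed. */
    union zpci_sic_iib iib = {{0}};

    zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, &iib);
}
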
+ */ + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); return res; } diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 236b34b75ddb..addefe8ccdba 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -88,6 +88,10 @@ struct sclp_info { unsigned char has_sipl : 1; unsigned char has_dirq : 1; unsigned char has_iplcc : 1; + unsigned char has_zpci_lsi : 1; + unsigned char has_aisii : 1; + unsigned char has_aeni : 1; + unsigned char has_aisi : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h index 1ac538b8cbf5..f76e5fdff23a 100644 --- a/arch/s390/include/asm/tpi.h +++ b/arch/s390/include/asm/tpi.h @@ -19,6 +19,19 @@ struct tpi_info { u32 :12; } __packed __aligned(4); +/* I/O-Interruption Code as stored by TPI for an Adapter I/O */ +struct tpi_adapter_info { + u32 aism:8; + u32 :22; + u32 error:1; + u32 forward:1; + u32 reserved; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :27; +} __packed __aligned(4); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_TPI_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index cfea7b77a5b8..be3ef9dd6972 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -50,6 +50,10 @@ #define UVC_CMD_SET_UNSHARE_ALL 0x0340 #define UVC_CMD_PIN_PAGE_SHARED 0x0341 #define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 +#define UVC_CMD_DUMP_INIT 0x0400 +#define UVC_CMD_DUMP_CONF_STOR_STATE 0x0401 +#define UVC_CMD_DUMP_CPU 0x0402 +#define UVC_CMD_DUMP_COMPLETE 0x0403 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 #define UVC_CMD_RETR_ATTEST 0x1020 @@ -77,6 +81,10 @@ enum uv_cmds_inst { BIT_UVC_CMD_UNSHARE_ALL = 20, BIT_UVC_CMD_PIN_PAGE_SHARED = 21, BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, + BIT_UVC_CMD_DUMP_INIT = 24, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE = 25, + BIT_UVC_CMD_DUMP_CPU = 26, + BIT_UVC_CMD_DUMP_COMPLETE = 27, BIT_UVC_CMD_RETR_ATTEST = 28, }; @@ -110,7 +118,16 @@ struct uv_cb_qui { u8 reserved88[158 - 136]; /* 0x0088 */ u16 max_guest_cpu_id; /* 0x009e */ u64 uv_feature_indications; /* 0x00a0 */ - u8 reserveda8[200 - 168]; /* 0x00a8 */ + u64 reserveda8; /* 0x00a8 */ + u64 supp_se_hdr_versions; /* 0x00b0 */ + u64 supp_se_hdr_pcf; /* 0x00b8 */ + u64 reservedc0; /* 0x00c0 */ + u64 conf_dump_storage_state_len; /* 0x00c8 */ + u64 conf_dump_finalize_len; /* 0x00d0 */ + u64 reservedd8; /* 0x00d8 */ + u64 supp_att_req_hdr_ver; /* 0x00e0 */ + u64 supp_att_pflags; /* 0x00e8 */ + u8 reservedf0[256 - 240]; /* 0x00f0 */ } __packed __aligned(8); /* Initialize Ultravisor */ @@ -240,6 +257,31 @@ struct uv_cb_attest { u64 reserved168[4]; /* 0x0168 */ } __packed __aligned(8); +struct uv_cb_dump_cpu { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 dump_area_origin; + u64 reserved28[5]; +} __packed __aligned(8); + +struct uv_cb_dump_stor_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 gaddr; + u64 reserved28[4]; +} __packed __aligned(8); + +struct uv_cb_dump_complete { + struct uv_cb_header header; + u64 reserved08[2]; + u64 config_handle; + u64 dump_area_origin; + u64 reserved30[5]; +} __packed __aligned(8); + static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; @@ -307,6 +349,12 @@ struct uv_info { unsigned int max_num_sec_conf; unsigned short max_guest_cpu_id; unsigned long uv_feature_indications; + unsigned long supp_se_hdr_ver; + unsigned long 
supp_se_hdr_pcf; + unsigned long conf_dump_storage_state_len; + unsigned long conf_dump_finalize_len; + unsigned long supp_att_req_hdr_ver; + unsigned long supp_att_pflags; }; extern struct uv_info uv_info; @@ -378,6 +426,7 @@ static inline int is_prot_virt_host(void) } int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); int uv_destroy_owned_page(unsigned long paddr); int uv_convert_from_secure(unsigned long paddr); int uv_convert_owned_from_secure(unsigned long paddr); diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 7a6b14874d65..a73cf01a1606 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index a5425075dd25..f9810d2a267c 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -234,6 +234,32 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + /* * Requests the Ultravisor to make a page accessible to a guest. * If it's brought in the first time, it will be cleared. If @@ -277,6 +303,8 @@ again: lock_page(page); ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); + if (should_export_before_import(uvcb, gmap->mm)) + uv_convert_from_secure(page_to_phys(page)); rc = make_secure_pte(ptep, uaddr, page, uvcb); pte_unmap_unlock(ptep, ptelock); unlock_page(page); @@ -334,6 +362,61 @@ int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) } EXPORT_SYMBOL_GPL(gmap_convert_to_secure); +/** + * gmap_destroy_page - Destroy a guest page. + * @gmap: the gmap of the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. 
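
A hedged sketch of how the Dump CPU UVC control block added to asm/uv.h above might be driven; this roughly mirrors what kvm_s390_pv_dump_cpu() does elsewhere in this merge. The cpu_handle and buffer come from the caller, and uv_call_sched() is assumed to be the existing UV call helper.

#include <linux/errno.h>
#include <asm/uv.h>

static int example_uv_dump_cpu(u64 cpu_handle, void *buff, u16 *rc, u16 *rrc)
{
    struct uv_cb_dump_cpu uvcb = {
        .header.cmd       = UVC_CMD_DUMP_CPU,
        .header.len       = sizeof(uvcb),
        .cpu_handle       = cpu_handle,
        .dump_area_origin = (u64)buff,
    };
    int cc = uv_call_sched(0, (u64)&uvcb);

    /* The Ultravisor return codes are reported back to the caller. */
    *rc  = uvcb.header.rc;
    *rrc = uvcb.header.rrc;
    return cc ? -EINVAL : 0;
}
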
+ */ +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +{ + struct vm_area_struct *vma; + unsigned long uaddr; + struct page *page; + int rc; + + rc = -EFAULT; + mmap_read_lock(gmap->mm); + + uaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(uaddr)) + goto out; + vma = vma_lookup(gmap->mm, uaddr); + if (!vma) + goto out; + /* + * Huge pages should not be able to become secure + */ + if (is_vm_hugetlb_page(vma)) + goto out; + + rc = 0; + /* we take an extra reference here */ + page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto out; + rc = uv_destroy_owned_page(page_to_phys(page)); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_owned_from_secure(page_to_phys(page)); + put_page(page); +out: + mmap_read_unlock(gmap->mm); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_destroy_page); + /* * To be called with the page locked or with an extra reference! This will * prevent gmap_make_secure from touching the page concurrently. Having 2 @@ -392,6 +475,54 @@ static ssize_t uv_query_facilities(struct kobject *kobj, static struct kobj_attribute uv_query_facilities_attr = __ATTR(facilities, 0444, uv_query_facilities, NULL); +static ssize_t uv_query_supp_se_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_se_hdr_ver_attr = + __ATTR(supp_se_hdr_ver, 0444, uv_query_supp_se_hdr_ver, NULL); + +static ssize_t uv_query_supp_se_hdr_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_se_hdr_pcf); +} + +static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = + __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); + +static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", + uv_info.guest_cpu_stor_len); +} + +static struct kobj_attribute uv_query_dump_cpu_len_attr = + __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); + +static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", + uv_info.conf_dump_storage_state_len); +} + +static struct kobj_attribute uv_query_dump_storage_state_len_attr = + __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); + +static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", + uv_info.conf_dump_finalize_len); +} + +static struct kobj_attribute uv_query_dump_finalize_len_attr = + __ATTR(dump_finalize_len, 0444, uv_query_dump_finalize_len, NULL); + static ssize_t uv_query_feature_indications(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -431,12 +562,37 @@ static ssize_t uv_query_max_guest_addr(struct kobject *kobj, static struct kobj_attribute uv_query_max_guest_addr_attr = __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); +static ssize_t 
uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, &uv_query_feature_indications_attr.attr, &uv_query_max_guest_cpus_attr.attr, &uv_query_max_guest_vms_attr.attr, &uv_query_max_guest_addr_attr.attr, + &uv_query_supp_se_hdr_ver_attr.attr, + &uv_query_supp_se_hdr_pcf_attr.attr, + &uv_query_dump_storage_state_len_attr.attr, + &uv_query_dump_finalize_len_attr.attr, + &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, NULL, }; diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 2e84d3922f7c..33f4ff909476 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -34,6 +34,7 @@ config KVM select SRCU select KVM_VFIO select INTERVAL_TREE + select MMU_NOTIFIER help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 26f4a74e5ce4..02217fb4ae10 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -10,4 +10,5 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 227ed0009354..082ec5f2c3a5 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -262,77 +262,77 @@ struct aste { /* .. 
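
The new attributes join the existing UV query interface; a hedged userspace sketch for reading them, assuming the usual /sys/firmware/uv/query/ location (the values are emitted as bare hex):

#include <stdio.h>

/* e.g. name = "supp_se_hdr_ver", "dump_finalize_len", "supp_att_pflags" */
static int read_uv_query(const char *name, unsigned long *val)
{
    char path[128];
    FILE *f;
    int ok;

    snprintf(path, sizeof(path), "/sys/firmware/uv/query/%s", name);
    f = fopen(path, "r");
    if (!f)
        return -1;
    ok = fscanf(f, "%lx", val);
    fclose(f);
    return ok == 1 ? 0 : -1;
}
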
more fields there */ }; -int ipte_lock_held(struct kvm_vcpu *vcpu) +int ipte_lock_held(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) { + if (sclp.has_siif) { int rc; - read_lock(&vcpu->kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0; - read_unlock(&vcpu->kvm->arch.sca_lock); + read_lock(&kvm->arch.sca_lock); + rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + read_unlock(&kvm->arch.sca_lock); return rc; } - return vcpu->kvm->arch.ipte_lock_count != 0; + return kvm->arch.ipte_lock_count != 0; } -static void ipte_lock_simple(struct kvm_vcpu *vcpu) +static void ipte_lock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count++; - if (vcpu->kvm->arch.ipte_lock_count > 1) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count++; + if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.k) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_unlock_simple(struct kvm_vcpu *vcpu) +static void ipte_unlock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count--; - if (vcpu->kvm->arch.ipte_lock_count) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count--; + if (kvm->arch.ipte_lock_count) goto out; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); - wake_up(&vcpu->kvm->arch.ipte_wq); + read_unlock(&kvm->arch.sca_lock); + wake_up(&kvm->arch.ipte_wq); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_lock_siif(struct kvm_vcpu *vcpu) +static void ipte_lock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.kg) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -340,15 +340,15 @@ retry: new.k = 1; new.kh++; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); } -static void ipte_unlock_siif(struct kvm_vcpu *vcpu) +static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; @@ -356,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu) if (!new.kh) new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); if 
(!new.kh) - wake_up(&vcpu->kvm->arch.ipte_wq); + wake_up(&kvm->arch.ipte_wq); } -void ipte_lock(struct kvm_vcpu *vcpu) +void ipte_lock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_lock_siif(vcpu); + if (sclp.has_siif) + ipte_lock_siif(kvm); else - ipte_lock_simple(vcpu); + ipte_lock_simple(kvm); } -void ipte_unlock(struct kvm_vcpu *vcpu) +void ipte_unlock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_unlock_siif(vcpu); + if (sclp.has_siif) + ipte_unlock_siif(kvm); else - ipte_unlock_simple(vcpu); + ipte_unlock_simple(kvm); } static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, @@ -1086,7 +1086,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, try_storage_prot_override = storage_prot_override_applicable(vcpu); need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); /* * Since we do the access further down ultimately via a move instruction * that does key checking and returns an error in case of a protection @@ -1127,7 +1127,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, } out_unlock: if (need_ipte_lock) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); if (nr_pages > ARRAY_SIZE(gpa_array)) vfree(gpas); return rc; @@ -1199,10 +1199,10 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, access_key); - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return rc; } @@ -1465,7 +1465,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, * tables/pointers we read stay valid - unshadowing is however * always possible - only guest_table_lock protects us. */ - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) @@ -1499,7 +1499,7 @@ shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); mmap_read_unlock(sg->mm); return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 1124ff282012..9408d6cc8e2c 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -440,9 +440,9 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, return access_guest_real(vcpu, gra, data, len, 0); } -void ipte_lock(struct kvm_vcpu *vcpu); -void ipte_unlock(struct kvm_vcpu *vcpu); -int ipte_lock_held(struct kvm_vcpu *vcpu); +void ipte_lock(struct kvm *kvm); +void ipte_unlock(struct kvm *kvm); +int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); /* MVPG PEI indication bits */ diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 8bd42a20d924..88112065d941 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -528,12 +528,27 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) static int handle_pv_notification(struct kvm_vcpu *vcpu) { + int ret; + if (vcpu->arch.sie_block->ipa == 0xb210) return handle_pv_spx(vcpu); if (vcpu->arch.sie_block->ipa == 0xb220) return handle_pv_sclp(vcpu); if (vcpu->arch.sie_block->ipa == 0xb9a4) return handle_pv_uvc(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) { + /* + * Besides external call, other SIGP orders also cause a + * 108 (pv notify) intercept. 
In contrast to external call, + * these orders need to be emulated and hence the appropriate + * place to handle them is in handle_instruction(). + * So first try kvm_s390_handle_sigp_pei() and if that isn't + * successful, go on with handle_instruction(). + */ + ret = kvm_s390_handle_sigp_pei(vcpu); + if (!ret) + return ret; + } return handle_instruction(vcpu); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index af96dc0549a4..b9c944b262c7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -28,9 +28,11 @@ #include <asm/switch_to.h> #include <asm/nmi.h> #include <asm/airq.h> +#include <asm/tpi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" +#include "pci.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -702,7 +704,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) /* * We indicate floating repressible conditions along with * other pending conditions. Channel Report Pending and Channel - * Subsystem damage are the only two and and are indicated by + * Subsystem damage are the only two and are indicated by * bits in mcic and masked in cr14. */ if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { @@ -3311,10 +3313,87 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) +static void aen_host_forward(unsigned long si) { + struct kvm_s390_gisa_interrupt *gi; + struct zpci_gaite *gaite; + struct kvm *kvm; + + gaite = (struct zpci_gaite *)aift->gait + + (si * sizeof(struct zpci_gaite)); + if (gaite->count == 0) + return; + if (gaite->aisb != 0) + set_bit_inv(gaite->aisbo, (unsigned long *)gaite->aisb); + + kvm = kvm_s390_pci_si_to_kvm(aift, si); + if (!kvm) + return; + gi = &kvm->arch.gisa_int; + + if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) || + !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) { + gisa_set_ipm_gisc(gi->origin, gaite->gisc); + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + kvm->stat.aen_forward++; + } +} + +static void aen_process_gait(u8 isc) +{ + bool found = false, first = true; + union zpci_sic_iib iib = {{0}}; + unsigned long si, flags; + + spin_lock_irqsave(&aift->gait_lock, flags); + + if (!aift->gait) { + spin_unlock_irqrestore(&aift->gait_lock, flags); + return; + } + + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv)); + if (si == -1UL) { + if (first || found) { + /* Re-enable interrupts. 
*/ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, + &iib); + first = found = false; + } else { + /* Interrupts on and all bits processed */ + break; + } + found = false; + si = 0; + /* Scan again after re-enabling interrupts */ + continue; + } + found = true; + aen_host_forward(si); + } + + spin_unlock_irqrestore(&aift->gait_lock, flags); +} + +static void gib_alert_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info; + inc_irq_stat(IRQIO_GAL); - process_gib_alert_list(); + + if ((info->forward || info->error) && + IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + aen_process_gait(info->isc); + if (info->aism != 0) + process_gib_alert_list(); + } else { + process_gib_alert_list(); + } } static struct airq_struct gib_alert_irq = { @@ -3326,6 +3405,11 @@ void kvm_s390_gib_destroy(void) { if (!gib) return; + if (kvm_s390_pci_interp_allowed() && aift) { + mutex_lock(&aift->aift_lock); + kvm_s390_pci_aen_exit(); + mutex_unlock(&aift->aift_lock); + } chsc_sgib(0); unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); @@ -3363,6 +3447,14 @@ int kvm_s390_gib_init(u8 nisc) goto out_unreg_gal; } + if (kvm_s390_pci_interp_allowed()) { + if (kvm_s390_pci_aen_init(nisc)) { + pr_err("Initializing AEN for PCI failed\n"); + rc = -EIO; + goto out_unreg_gal; + } + } + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); goto out; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8fcb56141689..edfd4bbd0cba 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -31,6 +31,7 @@ #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/pgtable.h> +#include <linux/mmu_notifier.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> @@ -47,6 +48,7 @@ #include <asm/fpu/api.h> #include "kvm-s390.h" #include "gaccess.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -63,7 +65,8 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_float_mchk), STATS_DESC_COUNTER(VM, inject_pfault_done), STATS_DESC_COUNTER(VM, inject_service_signal), - STATS_DESC_COUNTER(VM, inject_virtio) + STATS_DESC_COUNTER(VM, inject_virtio), + STATS_DESC_COUNTER(VM, aen_forward) }; const struct kvm_stats_header kvm_vm_stats_header = { @@ -502,6 +505,14 @@ int kvm_arch_init(void *opaque) goto out; } + if (kvm_s390_pci_interp_allowed()) { + rc = kvm_s390_pci_init(); + if (rc) { + pr_err("Unable to allocate AIFT for PCI\n"); + goto out; + } + } + rc = kvm_s390_gib_init(GAL_ISC); if (rc) goto out; @@ -516,6 +527,8 @@ out: void kvm_arch_exit(void) { kvm_s390_gib_destroy(); + if (kvm_s390_pci_interp_allowed()) + kvm_s390_pci_exit(); debug_unregister(kvm_s390_dbf); debug_unregister(kvm_s390_dbf_uv); } @@ -606,6 +619,32 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_PROTECTED: r = is_prot_virt_host(); break; + case KVM_CAP_S390_PROTECTED_DUMP: { + u64 pv_cmds_dump[] = { + BIT_UVC_CMD_DUMP_INIT, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE, + BIT_UVC_CMD_DUMP_CPU, + BIT_UVC_CMD_DUMP_COMPLETE, + }; + int i; + + r = is_prot_virt_host(); + + for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) { + if (!test_bit_inv(pv_cmds_dump[i], + (unsigned long *)&uv_info.inst_calls_list)) { + r = 0; + break; + } + } + break; + } + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } @@ -817,6 +856,20 @@ int 
kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -1019,6 +1072,42 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu) +{ + /* Only set the ECB bits after guest requests zPCI interpretation */ + if (!vcpu->kvm->arch.use_zpci_interp) + return; + + vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI; + vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; +} + +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + if (!kvm_s390_pci_interp_allowed()) + return; + + /* + * If host is configured for PCI and the necessary facilities are + * available, turn on interpretation for the life of this guest + */ + kvm->arch.use_zpci_interp = 1; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_pci_setup(vcpu); + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) { unsigned long cx; @@ -1691,6 +1780,57 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +/** + * kvm_s390_update_topology_change_report - update CPU topology change report + * @kvm: guest KVM description + * @val: set or clear the MTCR bit + * + * Updates the Multiprocessor Topology-Change-Report bit to signal + * the guest with a topology change. + * This is only relevant if the topology facility is present. + * + * The SCA version, bsca or esca, doesn't matter as offset is the same. 
+ */ +static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) +{ + union sca_utility new, old; + struct bsca_block *sca; + + read_lock(&kvm->arch.sca_lock); + sca = kvm->arch.sca; + do { + old = READ_ONCE(sca->utility); + new = old; + new.mtcr = val; + } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +} + +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1711,6 +1851,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1736,6 +1879,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1809,6 +1955,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -2166,12 +2315,25 @@ out: return r; } -static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) +/** + * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to + * non protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Does not stop in case of error, tries to convert as many + * CPUs as possible. In case of error, the RC and RRC of the last error are + * returned. + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc) { struct kvm_vcpu *vcpu; - u16 rc, rrc; - int ret = 0; unsigned long i; + u16 _rc, _rrc; + int ret = 0; /* * We ignore failures and try to destroy as many CPUs as possible. @@ -2183,9 +2345,9 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) */ kvm_for_each_vcpu(i, vcpu, kvm) { mutex_lock(&vcpu->mutex); - if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) { - *rcp = rc; - *rrcp = rrc; + if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) { + *rc = _rc; + *rrc = _rrc; ret = -EIO; } mutex_unlock(&vcpu->mutex); @@ -2196,6 +2358,17 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) return ret; } +/** + * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM + * to protected. 
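
A hedged userspace sketch of driving the new CPU-topology plumbing: the capability is enabled per VM before vCPUs are created, and the MTCR bit is set or read through the KVM_S390_VM_CPU_TOPOLOGY attribute group handled above.

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_topology(int vm_fd)
{
    struct kvm_enable_cap cap = { .cap = KVM_CAP_S390_CPU_TOPOLOGY };

    return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);   /* must precede vCPU creation */
}

static int set_mtcr(int vm_fd, int on)
{
    struct kvm_device_attr attr = {
        .group = KVM_S390_VM_CPU_TOPOLOGY,
        .attr  = on,                 /* the kernel applies !!attr->attr */
    };

    return ioctl(vm_fd, KVM_SET_DEVICE_ATTR, &attr);
}

static int get_mtcr(int vm_fd, __u8 *mtcr)
{
    struct kvm_device_attr attr = {
        .group = KVM_S390_VM_CPU_TOPOLOGY,
        .addr  = (__u64)(unsigned long)mtcr,   /* kernel put_user()s a u8 here */
    };

    return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);
}
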
+ * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Tries to undo the conversion in case of error. + * + * Return: 0 in case of success, otherwise -EIO + */ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) { unsigned long i; @@ -2220,6 +2393,115 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) return r; } +/* + * Here we provide user space with a direct interface to query UV + * related data like UV maxima and available features as well as + * feature specific data. + * + * To facilitate future extension of the data structures we'll try to + * write data up to the maximum requested length. + */ +static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) +{ + ssize_t len_min; + + switch (info->header.id) { + case KVM_PV_INFO_VM: { + len_min = sizeof(info->header) + sizeof(info->vm); + + if (info->header.len_max < len_min) + return -EINVAL; + + memcpy(info->vm.inst_calls_list, + uv_info.inst_calls_list, + sizeof(uv_info.inst_calls_list)); + + /* It's max cpuid not max cpus, so it's off by one */ + info->vm.max_cpus = uv_info.max_guest_cpu_id + 1; + info->vm.max_guests = uv_info.max_num_sec_conf; + info->vm.max_guest_addr = uv_info.max_sec_stor_addr; + info->vm.feature_indication = uv_info.uv_feature_indications; + + return len_min; + } + case KVM_PV_INFO_DUMP: { + len_min = sizeof(info->header) + sizeof(info->dump); + + if (info->header.len_max < len_min) + return -EINVAL; + + info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len; + info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len; + info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len; + return len_min; + } + default: + return -EINVAL; + } +} + +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, + struct kvm_s390_pv_dmp dmp) +{ + int r = -EINVAL; + void __user *result_buff = (void __user *)dmp.buff_addr; + + switch (dmp.subcmd) { + case KVM_PV_DUMP_INIT: { + if (kvm->arch.pv.dumping) + break; + + /* + * Block SIE entry as concurrent dump UVCs could lead + * to validities. + */ + kvm_s390_vcpu_block_all(kvm); + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", + cmd->rc, cmd->rrc); + if (!r) { + kvm->arch.pv.dumping = true; + } else { + kvm_s390_vcpu_unblock_all(kvm); + r = -EINVAL; + } + break; + } + case KVM_PV_DUMP_CONFIG_STOR_STATE: { + if (!kvm->arch.pv.dumping) + break; + + /* + * gaddr is an output parameter since we might stop + * early. As dmp will be copied back in our caller, we + * don't need to do it ourselves. + */ + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_DUMP_COMPLETE: { + if (!kvm->arch.pv.dumping) + break; + + r = -EINVAL; + if (dmp.buff_len < uv_info.conf_dump_finalize_len) + break; + + r = kvm_s390_pv_dump_complete(kvm, result_buff, + &cmd->rc, &cmd->rrc); + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) { int r = 0; @@ -2356,6 +2638,68 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) cmd->rc, cmd->rrc); break; } + case KVM_PV_INFO: { + struct kvm_s390_pv_info info = {}; + ssize_t data_len; + + /* + * No need to check the VM protection here. 
+ * + * Maybe user space wants to query some of the data + * when the VM is still unprotected. If we see the + * need to fence a new data command we can still + * return an error in the info handler. + */ + + r = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info.header))) + break; + + r = -EINVAL; + if (info.header.len_max < sizeof(info.header)) + break; + + data_len = kvm_s390_handle_pv_info(&info); + if (data_len < 0) { + r = data_len; + break; + } + /* + * If a data command struct is extended (multiple + * times) this can be used to determine how much of it + * is valid. + */ + info.header.len_written = data_len; + + r = -EFAULT; + if (copy_to_user(argp, &info, data_len)) + break; + + r = 0; + break; + } + case KVM_PV_DUMP: { + struct kvm_s390_pv_dmp dmp; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&dmp, argp, sizeof(dmp))) + break; + + r = kvm_s390_pv_dmp(kvm, cmd, dmp); + if (r) + break; + + if (copy_to_user(argp, &dmp, sizeof(dmp))) { + r = -EFAULT; + break; + } + + break; + } default: r = -ENOTTY; } @@ -2581,6 +2925,19 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -2742,6 +3099,14 @@ static void sca_dispose(struct kvm *kvm) kvm->arch.sca = NULL; } +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_clear_list(kvm); + + __kvm_arch_free_vm(kvm); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; @@ -2824,6 +3189,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_crypto_init(kvm); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + mutex_lock(&kvm->lock); + kvm_s390_pci_init_list(kvm); + kvm_s390_vcpu_pci_enable_interp(kvm); + mutex_unlock(&kvm->lock); + } + mutex_init(&kvm->arch.float_int.ais_lock); spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) @@ -2877,6 +3249,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) sca_del_vcpu(vcpu); + kvm_s390_update_topology_change_report(vcpu->kvm, 1); if (kvm_is_ucontrol(vcpu->kvm)) gmap_remove(vcpu->arch.gmap); @@ -2904,6 +3277,15 @@ void kvm_arch_destroy_vm(struct kvm *kvm) */ if (kvm_s390_pv_get_handle(kvm)) kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + /* + * Remove the mmu notifier only when the whole KVM VM is torn down, + * and only if one was registered to begin with. If the VM is + * currently not protected, but has been previously been protected, + * then it's possible that the notifier is still registered. + */ + if (kvm->arch.pv.mmu_notifier.ops) + mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm); + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) @@ -3047,9 +3429,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (!sclp.has_esca || !sclp.has_64bscao) return false; - mutex_lock(&kvm->lock); rc = kvm->arch.use_esca ? 
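
For the dump flow wired up above, a hedged userspace sketch that sizes the dump buffers via KVM_PV_INFO. A VMM would first check that KVM_CHECK_EXTENSION returns 1 for KVM_CAP_S390_PROTECTED_DUMP, then issue KVM_PV_DUMP_INIT before any per-vCPU or storage-state dumps.

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int query_pv_dump_lens(int vm_fd, __u64 *cpu_len, __u64 *compl_len)
{
    struct kvm_s390_pv_info info = {
        .header.id      = KVM_PV_INFO_DUMP,
        .header.len_max = sizeof(info),
    };
    struct kvm_pv_cmd cmd = {
        .cmd  = KVM_PV_INFO,
        .data = (__u64)(unsigned long)&info,
    };

    if (ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd))
        return -1;

    /* info.header.len_written tells how much of the reply is valid. */
    *cpu_len   = info.dump.dump_cpu_buffer_len;
    *compl_len = info.dump.dump_config_finalize_len;
    return 0;
}
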
0 : sca_switch_to_extended(kvm); - mutex_unlock(&kvm->lock); return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } @@ -3272,6 +3652,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= ECB_SRSI; + if (test_kvm_facility(vcpu->kvm, 11)) + vcpu->arch.sie_block->ecb |= ECB_PTF; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; if (!kvm_is_ucontrol(vcpu->kvm)) @@ -3324,6 +3706,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + kvm_s390_vcpu_pci_setup(vcpu); + mutex_lock(&vcpu->kvm->lock); if (kvm_s390_pv_is_protected(vcpu->kvm)) { rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); @@ -3403,6 +3787,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) rc = kvm_s390_vcpu_setup(vcpu); if (rc) goto out_ucontrol_uninit; + + kvm_s390_update_topology_change_report(vcpu->kvm, 1); return 0; out_ucontrol_uninit: @@ -4473,6 +4859,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; int rc; + /* + * Running a VM while dumping always has the potential to + * produce inconsistent dump data. But for PV vcpus a SIE + * entry while dumping could also lead to a fatal validity + * intercept which we absolutely want to avoid. + */ + if (vcpu->kvm->arch.pv.dumping) + return -EINVAL; + if (kvm_run->immediate_exit) return -EINTR; @@ -4912,6 +5307,48 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, return -ENOIOCTLCMD; } +static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu, + struct kvm_pv_cmd *cmd) +{ + struct kvm_s390_pv_dmp dmp; + void *data; + int ret; + + /* Dump initialization is a prerequisite */ + if (!vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp))) + return -EFAULT; + + /* We only handle this subcmd right now */ + if (dmp.subcmd != KVM_PV_DUMP_CPU) + return -EINVAL; + + /* CPU dump length is the same as create cpu storage donation. 
*/ + if (dmp.buff_len != uv_info.guest_cpu_stor_len) + return -EINVAL; + + data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc); + + VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x", + vcpu->vcpu_id, cmd->rc, cmd->rrc); + + if (ret) + ret = -EINVAL; + + /* On success copy over the dump data */ + if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len)) + ret = -EFAULT; + + kvfree(data); + return ret; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5076,6 +5513,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, irq_state.len); break; } + case KVM_S390_PV_CPU_COMMAND: { + struct kvm_pv_cmd cmd; + + r = -EINVAL; + if (!is_prot_virt_host()) + break; + + r = -EFAULT; + if (copy_from_user(&cmd, argp, sizeof(cmd))) + break; + + r = -EINVAL; + if (cmd.flags) + break; + + /* We only handle this cmd right now */ + if (cmd.cmd != KVM_PV_DUMP) + break; + + r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd); + + /* Always copy over UV rc / rrc data */ + if (copy_to_user((__u8 __user *)argp, &cmd.rc, + sizeof(cmd.rc) + sizeof(cmd.rrc))) + r = -EFAULT; + break; + } default: r = -ENOTTY; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 497d52a83c78..f6fd668f887e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -250,6 +250,11 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, unsigned long tweak, u16 *rc, u16 *rrc); int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc); static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) { @@ -374,6 +379,7 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); @@ -508,6 +514,16 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); /** + * kvm_s390_vcpu_pci_enable_interp + * + * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store + * interpretation as well as adapter interruption forwarding. + * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm); + +/** * diag9c_forwarding_hz * * Set the maximum number of diag9c forwarding per second diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c new file mode 100644 index 000000000000..4946fb7757d6 --- /dev/null +++ b/arch/s390/kvm/pci.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 
2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <asm/pci.h> +#include <asm/pci_insn.h> +#include <asm/pci_io.h> +#include <asm/sclp.h> +#include "pci.h" +#include "kvm-s390.h" + +struct zpci_aift *aift; + +static inline int __set_irq_noiib(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return zpci_set_irq_ctrl(ctl, isc, &iib); +} + +void kvm_s390_pci_aen_exit(void) +{ + unsigned long flags; + struct kvm_zdev **gait_kzdev; + + lockdep_assert_held(&aift->aift_lock); + + /* + * Contents of the aipb remain registered for the life of the host + * kernel, the information preserved in zpci_aipb and zpci_aif_sbv + * in case we insert the KVM module again later. Clear the AIFT + * information and free anything not registered with underlying + * firmware. + */ + spin_lock_irqsave(&aift->gait_lock, flags); + gait_kzdev = aift->kzdev; + aift->gait = NULL; + aift->sbv = NULL; + aift->kzdev = NULL; + spin_unlock_irqrestore(&aift->gait_lock, flags); + + kfree(gait_kzdev); +} + +static int zpci_setup_aipb(u8 nisc) +{ + struct page *page; + int size, rc; + + zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + if (!zpci_aipb) + return -ENOMEM; + + aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, 0); + if (!aift->sbv) { + rc = -ENOMEM; + goto free_aipb; + } + zpci_aif_sbv = aift->sbv; + size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES * + sizeof(struct zpci_gaite))); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size); + if (!page) { + rc = -ENOMEM; + goto free_sbv; + } + aift->gait = (struct zpci_gaite *)page_to_phys(page); + + zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector); + zpci_aipb->aipb.gait = virt_to_phys(aift->gait); + zpci_aipb->aipb.afi = nisc; + zpci_aipb->aipb.faal = ZPCI_NR_DEVICES; + + /* Setup Adapter Event Notification Interpretation */ + if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) { + rc = -EIO; + goto free_gait; + } + + return 0; + +free_gait: + free_pages((unsigned long)aift->gait, size); +free_sbv: + airq_iv_release(aift->sbv); + zpci_aif_sbv = NULL; +free_aipb: + kfree(zpci_aipb); + zpci_aipb = NULL; + + return rc; +} + +static int zpci_reset_aipb(u8 nisc) +{ + /* + * AEN registration can only happen once per system boot. If + * an aipb already exists then AEN was already registered and + * we can re-use the aipb contents. This can only happen if + * the KVM module was removed and re-inserted. However, we must + * ensure that the same forwarding ISC is used as this is assigned + * during KVM module load. 
+ */ + if (zpci_aipb->aipb.afi != nisc) + return -EINVAL; + + aift->sbv = zpci_aif_sbv; + aift->gait = (struct zpci_gaite *)zpci_aipb->aipb.gait; + + return 0; +} + +int kvm_s390_pci_aen_init(u8 nisc) +{ + int rc = 0; + + /* If already enabled for AEN, bail out now */ + if (aift->gait || aift->sbv) + return -EPERM; + + mutex_lock(&aift->aift_lock); + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev), + GFP_KERNEL); + if (!aift->kzdev) { + rc = -ENOMEM; + goto unlock; + } + + if (!zpci_aipb) + rc = zpci_setup_aipb(nisc); + else + rc = zpci_reset_aipb(nisc); + if (rc) + goto free_zdev; + + /* Enable floating IRQs */ + if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) { + rc = -EIO; + kvm_s390_pci_aen_exit(); + } + + goto unlock; + +free_zdev: + kfree(aift->kzdev); +unlock: + mutex_unlock(&aift->aift_lock); + return rc; +} + +/* Modify PCI: Register floating adapter interruption forwarding */ +static int kvm_zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {}; + u8 status; + + fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruption forwarding */ +static int kvm_zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? 
-EIO : 0; +} + +static inline void unaccount_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + + if (user) + atomic_long_sub(nr_pages, &user->locked_vm); + if (current->mm) + atomic64_sub(nr_pages, ¤t->mm->pinned_vm); +} + +static inline int account_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + unsigned long page_limit, cur_pages, new_pages; + + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + atomic64_add(nr_pages, ¤t->mm->pinned_vm); + + return 0; +} + +static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, + bool assist) +{ + struct page *pages[1], *aibv_page, *aisb_page = NULL; + unsigned int msi_vecs, idx; + struct zpci_gaite *gaite; + unsigned long hva, bit; + struct kvm *kvm; + phys_addr_t gaddr; + int rc = 0, gisc, npages, pcount = 0; + + /* + * Interrupt forwarding is only applicable if the device is already + * enabled for interpretation + */ + if (zdev->gisa == 0) + return -EINVAL; + + kvm = zdev->kzdev->kvm; + msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi); + + /* Get the associated forwarding ISC - if invalid, return the error */ + gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc); + if (gisc < 0) + return gisc; + + /* Replace AIBV address */ + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto out; + } + aibv_page = pages[0]; + pcount++; + gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK); + fib->fmt0.aibv = gaddr; + + /* Pin the guest AISB if one was specified */ + if (fib->fmt0.sum == 1) { + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, + pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto unpin1; + } + aisb_page = pages[0]; + pcount++; + } + + /* Account for pinned pages, roll back on failure */ + if (account_mem(pcount)) + goto unpin2; + + /* AISB must be allocated before we can fill in GAITE */ + mutex_lock(&aift->aift_lock); + bit = airq_iv_alloc_bit(aift->sbv); + if (bit == -1UL) + goto unlock; + zdev->aisb = bit; /* store the summary bit number */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | + AIRQ_IV_BITLOCK | + AIRQ_IV_GUESTVEC, + phys_to_virt(fib->fmt0.aibv)); + + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + + /* If assist not requested, host will get all alerts */ + if (assist) + gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + else + gaite->gisa = 0; + + gaite->gisc = fib->fmt0.isc; + gaite->count++; + gaite->aisbo = fib->fmt0.aisbo; + gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb & + ~PAGE_MASK)); + aift->kzdev[zdev->aisb] = zdev->kzdev; + spin_unlock_irq(&aift->gait_lock); + + /* Update guest FIB for re-issue */ + fib->fmt0.aisbo = zdev->aisb & 63; + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.isc = gisc; + + /* Save some guest fib values in the host for later use */ + zdev->kzdev->fib.fmt0.isc = 
fib->fmt0.isc; + zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv; + mutex_unlock(&aift->aift_lock); + + /* Issue the clp to setup the irq now */ + rc = kvm_zpci_set_airq(zdev); + return rc; + +unlock: + mutex_unlock(&aift->aift_lock); +unpin2: + if (fib->fmt0.sum == 1) + unpin_user_page(aisb_page); +unpin1: + unpin_user_page(aibv_page); +out: + return rc; +} + +static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) +{ + struct kvm_zdev *kzdev = zdev->kzdev; + struct zpci_gaite *gaite; + struct page *vpage = NULL, *spage = NULL; + int rc, pcount = 0; + u8 isc; + + if (zdev->gisa == 0) + return -EINVAL; + + mutex_lock(&aift->aift_lock); + + /* + * If the clear fails due to an error, leave now unless we know this + * device is about to go away (force) -- In that case clear the GAITE + * regardless. + */ + rc = kvm_zpci_clear_airq(zdev); + if (rc && !force) + goto out; + + if (zdev->kzdev->fib.fmt0.aibv == 0) + goto out; + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + isc = gaite->gisc; + gaite->count--; + if (gaite->count == 0) { + /* Release guest AIBV and AISB */ + vpage = phys_to_page(kzdev->fib.fmt0.aibv); + if (gaite->aisb != 0) + spage = phys_to_page(gaite->aisb); + /* Clear the GAIT entry */ + gaite->aisb = 0; + gaite->gisc = 0; + gaite->aisbo = 0; + gaite->gisa = 0; + aift->kzdev[zdev->aisb] = 0; + /* Clear zdev info */ + airq_iv_free_bit(aift->sbv, zdev->aisb); + airq_iv_release(zdev->aibv); + zdev->aisb = 0; + zdev->aibv = NULL; + } + spin_unlock_irq(&aift->gait_lock); + kvm_s390_gisc_unregister(kzdev->kvm, isc); + kzdev->fib.fmt0.isc = 0; + kzdev->fib.fmt0.aibv = 0; + + if (vpage) { + unpin_user_page(vpage); + pcount++; + } + if (spage) { + unpin_user_page(spage); + pcount++; + } + if (pcount > 0) + unaccount_mem(pcount); +out: + mutex_unlock(&aift->aift_lock); + + return rc; +} + +static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + if (!kzdev) + return -ENOMEM; + + kzdev->zdev = zdev; + zdev->kzdev = kzdev; + + return 0; +} + +static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = zdev->kzdev; + WARN_ON(kzdev->zdev != zdev); + zdev->kzdev = NULL; + kfree(kzdev); +} + + +/* + * Register device with the specified KVM. If interpetation facilities are + * available, enable them and let userspace indicate whether or not they will + * be used (specify SHM bit to disable). + */ +int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm) +{ + int rc; + + if (!zdev) + return -EINVAL; + + mutex_lock(&zdev->kzdev_lock); + + if (zdev->kzdev || zdev->gisa != 0 || !kvm) { + mutex_unlock(&zdev->kzdev_lock); + return -EINVAL; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + + rc = kvm_s390_pci_dev_open(zdev); + if (rc) + goto err; + + /* + * If interpretation facilities aren't available, add the device to + * the kzdev list but don't enable for interpretation. 
+ */ + if (!kvm_s390_pci_interp_allowed()) + goto out; + + /* + * If this is the first request to use an interpreted device, make the + * necessary vcpu changes + */ + if (!kvm->arch.use_zpci_interp) + kvm_s390_vcpu_pci_enable_interp(kvm); + + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + goto err; + } + + /* + * Store information about the identity of the kvm guest allowed to + * access this device via interpretation to be used by host CLP + */ + zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + + rc = zpci_enable_device(zdev); + if (rc) + goto clear_gisa; + + /* Re-register the IOMMU that was already created */ + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table)); + if (rc) + goto clear_gisa; + +out: + zdev->kzdev->kvm = kvm; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list); + spin_unlock(&kvm->arch.kzdev_list_lock); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return 0; + +clear_gisa: + zdev->gisa = 0; +err: + if (zdev->kzdev) + kvm_s390_pci_dev_release(zdev); + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + kvm_put_kvm(kvm); + return rc; +} +EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm); + +void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev) +{ + struct kvm *kvm; + + if (!zdev) + return; + + mutex_lock(&zdev->kzdev_lock); + + if (WARN_ON(!zdev->kzdev)) { + mutex_unlock(&zdev->kzdev_lock); + return; + } + + kvm = zdev->kzdev->kvm; + mutex_lock(&kvm->lock); + + /* + * A 0 gisa means interpretation was never enabled, just remove the + * device from the list. + */ + if (zdev->gisa == 0) + goto out; + + /* Forwarding must be turned off before interpretation */ + if (zdev->kzdev->fib.fmt0.aibv != 0) + kvm_s390_pci_aif_disable(zdev, true); + + /* Remove the host CLP guest designation */ + zdev->gisa = 0; + + if (zdev_enabled(zdev)) { + if (zpci_disable_device(zdev)) + goto out; + } + + if (zpci_enable_device(zdev)) + goto out; + + /* Re-register the IOMMU that was already created */ + zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table)); + +out: + spin_lock(&kvm->arch.kzdev_list_lock); + list_del(&zdev->kzdev->entry); + spin_unlock(&kvm->arch.kzdev_list_lock); + kvm_s390_pci_dev_release(zdev); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + + kvm_put_kvm(kvm); +} +EXPORT_SYMBOL_GPL(kvm_s390_pci_unregister_kvm); + +void kvm_s390_pci_init_list(struct kvm *kvm) +{ + spin_lock_init(&kvm->arch.kzdev_list_lock); + INIT_LIST_HEAD(&kvm->arch.kzdev_list); +} + +void kvm_s390_pci_clear_list(struct kvm *kvm) +{ + /* + * This list should already be empty, either via vfio device closures + * or kvm fd cleanup. 
+ */ + spin_lock(&kvm->arch.kzdev_list_lock); + WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list)); + spin_unlock(&kvm->arch.kzdev_list_lock); +} + +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + +int kvm_s390_pci_init(void) +{ + aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + if (!aift) + return -ENOMEM; + + spin_lock_init(&aift->gait_lock); + mutex_init(&aift->aift_lock); + + return 0; +} + +void kvm_s390_pci_exit(void) +{ + mutex_destroy(&aift->aift_lock); + + kfree(aift); +} diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h new file mode 100644 index 000000000000..3a3606c3a0fe --- /dev/null +++ b/arch/s390/kvm/pci.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 
2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#ifndef __KVM_S390_PCI_H +#define __KVM_S390_PCI_H + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <asm/airq.h> +#include <asm/cpu.h> + +struct kvm_zdev { + struct zpci_dev *zdev; + struct kvm *kvm; + struct zpci_fib fib; + struct list_head entry; +}; + +struct zpci_gaite { + u32 gisa; + u8 gisc; + u8 count; + u8 reserved; + u8 aisbo; + u64 aisb; +}; + +struct zpci_aift { + struct zpci_gaite *gait; + struct airq_iv *sbv; + struct kvm_zdev **kzdev; + spinlock_t gait_lock; /* Protects the gait, used during AEN forward */ + struct mutex aift_lock; /* Protects the other structures in aift */ +}; + +extern struct zpci_aift *aift; + +static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift, + unsigned long si) +{ + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || aift->kzdev == 0 || + aift->kzdev[si] == 0) + return 0; + return aift->kzdev[si]->kvm; +}; + +int kvm_s390_pci_aen_init(u8 nisc); +void kvm_s390_pci_aen_exit(void); + +void kvm_s390_pci_init_list(struct kvm *kvm); +void kvm_s390_pci_clear_list(struct kvm *kvm); + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + +int kvm_s390_pci_init(void); +void kvm_s390_pci_exit(void); + +static inline bool kvm_s390_pci_interp_allowed(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2817: + case 0x2818: + case 0x2827: + case 0x2828: + case 0x2964: + case 0x2965: + /* No SHM on certain machines */ + return false; + default: + return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) && + sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi && + sclp.has_aisii); + } +} + +#endif /* __KVM_S390_PCI_H */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 83bb5cf97282..3335fa09b6f1 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -442,7 +442,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu) vcpu->stat.instruction_ipte_interlock++; if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm)); kvm_s390_retry_instr(vcpu); VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); return 0; @@ -873,10 +873,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - if (fc > 3) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } + /* Bailout forbidden function codes */ + if (fc > 3 && fc != 15) + goto out_no_data; + + /* + * fc 15 is provided only with + * - PTF/CPU topology support through facility 15 + * - KVM_CAP_S390_USER_STSI + */ + if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) || + !vcpu->kvm->arch.user_stsi)) + goto out_no_data; if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 || vcpu->run->s.regs.gprs[1] & 0xffff0000) @@ -910,6 +918,10 @@ static int handle_stsi(struct kvm_vcpu *vcpu) goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; + case 15: /* fc 15 is fully handled in userspace */ + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + return -EREMOTE; } if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem, @@ -1471,7 +1483,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) access_key = 
(operand2 & 0xf0) >> 4; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, GACC_STORE, access_key); @@ -1508,7 +1520,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) } if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return ret; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index cc7c9599f43e..7cb7799a0acb 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -7,13 +7,25 @@ */ #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/minmax.h> #include <linux/pagemap.h> #include <linux/sched/signal.h> #include <asm/gmap.h> #include <asm/uv.h> #include <asm/mman.h> +#include <linux/pagewalk.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> #include "kvm-s390.h" +static void kvm_s390_clear_pv_state(struct kvm *kvm) +{ + kvm->arch.pv.handle = 0; + kvm->arch.pv.guest_len = 0; + kvm->arch.pv.stor_base = 0; + kvm->arch.pv.stor_var = NULL; +} + int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) { int cc; @@ -108,7 +120,7 @@ static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) vfree(kvm->arch.pv.stor_var); free_pages(kvm->arch.pv.stor_base, get_order(uv_info.guest_base_stor_len)); - memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv)); + kvm_s390_clear_pv_state(kvm); } static int kvm_s390_pv_alloc_vm(struct kvm *kvm) @@ -152,21 +164,51 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) { int cc; - /* make all pages accessible before destroying the guest */ - s390_reset_acc(kvm->mm); - cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), UVC_CMD_DESTROY_SEC_CONF, rc, rrc); WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); - atomic_set(&kvm->mm->context.is_protected, 0); + /* + * if the mm still has a mapping, make all its pages accessible + * before destroying the guest + */ + if (mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + if (!cc) { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } else { + /* Intended memory leak on "impossible" error */ + s390_replace_asce(kvm->arch.gmap); + } KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); - /* Inteded memory leak on "impossible" error */ - if (!cc) - kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; } +static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); + u16 dummy; + + /* + * No locking is needed since this is the last thread of the last user of this + * struct mm. + * When the struct kvm gets deinitialized, this notifier is also + * unregistered. This means that if this notifier runs, then the + * struct kvm is still valid. 
+ */ + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); +} + +static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { + .release = kvm_s390_pv_mmu_notifier_release, +}; + int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) { struct uv_cb_cgc uvcb = { @@ -197,14 +239,22 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) /* Outputs */ kvm->arch.pv.handle = uvcb.guest_handle; + atomic_inc(&kvm->mm->context.protected_count); if (cc) { - if (uvcb.header.rc & UVC_RC_NEED_DESTROY) + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) { kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); - else + } else { + atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); + } return -EIO; } kvm->arch.gmap->guest_handle = uvcb.guest_handle; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + } return 0; } @@ -224,8 +274,6 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, *rrc = uvcb.header.rrc; KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", *rc, *rrc); - if (!cc) - atomic_set(&kvm->mm->context.is_protected, 1); return cc ? -EINVAL : 0; } @@ -298,3 +346,200 @@ int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) return -EINVAL; return 0; } + +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_cpu uvcb = { + .header.cmd = UVC_CMD_DUMP_CPU, + .header.len = sizeof(uvcb), + .cpu_handle = vcpu->arch.pv.handle, + .dump_area_origin = (u64)buff, + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc; +} + +/* Size of the cache for the storage state dump data. 1MB for now */ +#define DUMP_BUFF_LEN HPAGE_SIZE + +/** + * kvm_s390_pv_dump_stor_state + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @gaddr: Starting absolute guest address for which the storage state + * is requested. + * @buff_user_len: Length of the buff_user buffer + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Stores buff_len bytes of tweak component values to buff_user + * starting with the 1MB block specified by the absolute guest address + * (gaddr). The gaddr pointer will be updated with the last address + * for which data was written when returning to userspace. buff_user + * might be written to even if an error rc is returned. For instance + * if we encounter a fault after writing the first page of data. 
+ * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the cache fails + * -EINVAL if gaddr is not aligned to 1MB + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_stor_state uvcb = { + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, + .header.len = sizeof(uvcb), + .config_handle = kvm->arch.pv.handle, + .gaddr = *gaddr, + .dump_area_origin = 0, + }; + const u64 increment_len = uv_info.conf_dump_storage_state_len; + size_t buff_kvm_size; + size_t size_done = 0; + u8 *buff_kvm = NULL; + int cc, ret; + + ret = -EINVAL; + /* UV call processes 1MB guest storage chunks at a time */ + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) + goto out; + + /* + * We provide the storage state for 1MB chunks of guest + * storage. The buffer will need to be aligned to + * conf_dump_storage_state_len so we don't end on a partial + * chunk. + */ + if (!buff_user_len || + !IS_ALIGNED(buff_user_len, increment_len)) + goto out; + + /* + * Allocate a buffer from which we will later copy to the user + * process. We don't want userspace to dictate our buffer size + * so we limit it to DUMP_BUFF_LEN. + */ + ret = -ENOMEM; + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); + buff_kvm = vzalloc(buff_kvm_size); + if (!buff_kvm) + goto out; + + ret = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + /* We will loop until the user buffer is filled or an error occurs */ + do { + /* Get 1MB worth of guest storage state data */ + cc = uv_call_sched(0, (u64)&uvcb); + + /* All or nothing */ + if (cc) { + ret = -EINVAL; + break; + } + + size_done += increment_len; + uvcb.dump_area_origin += increment_len; + buff_user_len -= increment_len; + uvcb.gaddr += HPAGE_SIZE; + + /* KVM Buffer full, time to copy to the process */ + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { + if (copy_to_user(buff_user, buff_kvm, size_done)) { + ret = -EFAULT; + break; + } + + buff_user += size_done; + size_done = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + } + } while (buff_user_len); + + /* Report back where we ended dumping */ + *gaddr = uvcb.gaddr; + + /* Lets only log errors, we don't want to spam */ +out: + if (ret) + KVM_UV_EVENT(kvm, 3, + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + vfree(buff_kvm); + + return ret; +} + +/** + * kvm_s390_pv_dump_complete + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Completes the dumping operation and writes the completion data to + * user space. 
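 *
 * A rough sketch of the ioctl sequence userspace is expected to drive for a
 * full dump, via the VM-level KVM_PV_DUMP command and the new per-vcpu
 * KVM_S390_PV_CPU_COMMAND ioctl (pseudo-code only; buffer setup, return-code
 * checks and the per-chunk memory loop are omitted):
 *
 *	struct kvm_s390_pv_dmp dmp = { .subcmd = KVM_PV_DUMP_INIT };
 *	struct kvm_pv_cmd cmd = { .cmd = KVM_PV_DUMP, .data = (__u64)&dmp };
 *
 *	ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);	/* start dumping */
 *
 *	dmp.subcmd = KVM_PV_DUMP_CPU;			/* once per vcpu */
 *	ioctl(vcpu_fd, KVM_S390_PV_CPU_COMMAND, &cmd);
 *
 *	dmp.subcmd = KVM_PV_DUMP_CONFIG_STOR_STATE;	/* loop over guest memory */
 *	ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);
 *
 *	dmp.subcmd = KVM_PV_DUMP_COMPLETE;		/* finalize */
 *	ioctl(vm_fd, KVM_S390_PV_COMMAND, &cmd);
 *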
+ * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the completion buffer fails + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_complete complete = { + .header.len = sizeof(complete), + .header.cmd = UVC_CMD_DUMP_COMPLETE, + .config_handle = kvm_s390_pv_get_handle(kvm), + }; + u64 *compl_data; + int ret; + + /* Allocate dump area */ + compl_data = vzalloc(uv_info.conf_dump_finalize_len); + if (!compl_data) + return -ENOMEM; + complete.dump_area_origin = (u64)compl_data; + + ret = uv_call_sched(0, (u64)&complete); + *rc = complete.header.rc; + *rrc = complete.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", + complete.header.rc, complete.header.rrc); + + if (!ret) { + /* + * kvm_s390_pv_dealloc_vm() will also (mem)set + * this to false on a reboot or other destroy + * operation for this vm. + */ + kvm->arch.pv.dumping = false; + kvm_s390_vcpu_unblock_all(kvm); + ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len); + if (ret) + ret = -EFAULT; + } + vfree(compl_data); + /* If the UVC returned an error, translate it to -EINVAL */ + if (ret > 0) + ret = -EINVAL; + return ret; +} diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 8aaee2892ec3..cb747bf6c798 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -480,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) struct kvm_vcpu *dest_vcpu; u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); - if (order_code == SIGP_EXTERNAL_CALL) { + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); BUG_ON(dest_vcpu == NULL); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index dada78b92691..94138f8f0c1c 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -503,6 +503,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; + /* + * CPU Topology + * This facility only uses the utility field of the SCA and none of + * the cpu entries that are problematic with the other interpretation + * facilities so we can pass it through + */ + if (test_kvm_facility(vcpu->kvm, 11)) + scb_s->ecb |= scb_o->ecb & ECB_PTF; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { /* remap the prefix is tx is toggled on */ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e173b6187ad5..ee7871f770fb 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -754,6 +754,7 @@ void do_secure_storage_access(struct pt_regs *regs) struct vm_area_struct *vma; struct mm_struct *mm; struct page *page; + struct gmap *gmap; int rc; /* @@ -783,6 +784,17 @@ void do_secure_storage_access(struct pt_regs *regs) } switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) { + do_fault_error(regs, VM_ACCESS_FLAGS, VM_FAULT_BADMAP); + break; + } + fallthrough; case USER_FAULT: mm = current->mm; mmap_read_lock(mm); @@ -811,7 +823,6 @@ void 
do_secure_storage_access(struct pt_regs *regs) if (rc) BUG(); break; - case GMAP_FAULT: default: do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); WARN_ON_ONCE(1); @@ -837,6 +848,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access); void do_secure_storage_violation(struct pt_regs *regs) { + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; /* * Either KVM messed up the secure guest mapping or the same * page is mapped into multiple secure guests. diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b8ae4a4aa2ba..62758cb5872f 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2697,41 +2697,168 @@ void s390_reset_cmma(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(s390_reset_cmma); +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, +}; + /* - * make inaccessible pages accessible again + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. */ -static int __s390_reset_acc(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { - pte_t pte = READ_ONCE(*ptep); + unsigned long i; - /* There is a reference through the mapping */ - if (pte_present(pte)) - WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. 
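 *
 * Usage sketch (mirroring the PV teardown code above, which drops every
 * secure page of the guest address space before the mm goes away):
 *
 *	if (mmget_not_zero(kvm->mm)) {
 *		s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
 *		mmput(kvm->mm);
 *	}
 *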
+ * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } return 0; } +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); -static const struct mm_walk_ops reset_acc_walk_ops = { - .pte_entry = __s390_reset_acc, -}; +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; -#include <linux/sched/mm.h> -void s390_reset_acc(struct mm_struct *mm) + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. 
+ */ +int s390_replace_asce(struct gmap *gmap) { - if (!mm_is_protected(mm)) - return; + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + /* - * we might be called during - * reset: we walk the pages and clear - * close of all kvm file descriptors: we walk the pages and clear - * exit of process on fd closure: vma already gone, do nothing + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. */ - if (!mmget_not_zero(mm)) - return; - mmap_read_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); - mmap_read_unlock(mm); - mmput(mm); + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; } -EXPORT_SYMBOL_GPL(s390_reset_acc); +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index bc980fd313d5..73cdc5539384 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -61,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio); static struct kmem_cache *zdev_fmb_cache; +/* AEN structures that must be preserved over KVM module re-insertion */ +union zpci_sic_iib *zpci_aipb; +EXPORT_SYMBOL_GPL(zpci_aipb); +struct airq_iv *zpci_aif_sbv; +EXPORT_SYMBOL_GPL(zpci_aif_sbv); + struct zpci_dev *get_zdev_by_fid(u32 fid) { struct zpci_dev *tmp, *zdev = NULL; @@ -120,11 +126,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, fib.pba = base; fib.pal = limit; fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc) zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); return cc; } +EXPORT_SYMBOL_GPL(zpci_register_ioat); /* Modify PCI: Unregister I/O address translation parameters */ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) @@ -133,6 +141,8 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc) zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); @@ -160,6 +170,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) atomic64_set(&zdev->unmapped_pages, 0); fib.fmb_addr = virt_to_phys(zdev->fmb); + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc) { kmem_cache_free(zdev_fmb_cache, zdev->fmb); @@ -178,6 +189,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) if (!zdev->fmb) return -EINVAL; + fib.gd = zdev->gisa; + /* Function measurement is disabled if fmb address is zero */ cc = zpci_mod_fc(req, &fib, &status); if (cc == 3) /* Function already gone. 
*/ @@ -700,6 +713,7 @@ int zpci_enable_device(struct zpci_dev *zdev) zpci_update_fh(zdev, fh); return rc; } +EXPORT_SYMBOL_GPL(zpci_enable_device); int zpci_disable_device(struct zpci_dev *zdev) { @@ -723,6 +737,7 @@ int zpci_disable_device(struct zpci_dev *zdev) } return rc; } +EXPORT_SYMBOL_GPL(zpci_disable_device); /** * zpci_hot_reset_device - perform a reset of the given zPCI function @@ -816,6 +831,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) kref_init(&zdev->kref); mutex_init(&zdev->lock); + mutex_init(&zdev->kzdev_lock); rc = zpci_init_iommu(zdev); if (rc) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 375e0a5120bc..ee367798e388 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -106,6 +106,8 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->max_msi = response->noi; zdev->fmb_update = response->mui; zdev->version = response->version; + zdev->maxstbl = response->maxstbl; + zdev->dtsm = response->dtsm; switch (response->version) { case 1: @@ -229,12 +231,16 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma { struct clp_req_rsp_set_pci *rrb; int rc, retries = 100; + u32 gisa = 0; *fh = 0; rrb = clp_alloc_block(GFP_KERNEL); if (!rrb) return -ENOMEM; + if (command != CLP_SET_DISABLE_PCI_FN) + gisa = zdev->gisa; + do { memset(rrb, 0, sizeof(*rrb)); rrb->request.hdr.len = sizeof(rrb->request); @@ -243,6 +249,7 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma rrb->request.fh = zdev->fh; rrb->request.oc = command; rrb->request.ndas = nr_dma_as; + rrb->request.gisa = gisa; rc = clp_req(rrb, CLP_LPS_PCI); if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) { diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 1a822b7799f8..56480be48244 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -92,6 +92,7 @@ u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) return cc; } +EXPORT_SYMBOL_GPL(zpci_mod_fc); /* Refresh PCI Translations */ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) @@ -138,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; @@ -149,6 +150,7 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) return 0; } +EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl); /* PCI Load */ static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 500cd2dbdf53..a2b42a63a53b 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -11,16 +11,10 @@ #include <asm/isc.h> #include <asm/airq.h> +#include <asm/tpi.h> static enum {FLOATING, DIRECTED} irq_delivery; -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 -#define SIC_IRQ_MODE_DIRECT 4 -#define SIC_IRQ_MODE_D_ALL 16 -#define SIC_IRQ_MODE_D_SINGLE 17 -#define SIC_IRQ_MODE_SET_CPU 18 - /* * summary bit vector * FLOATING - summary bit per function @@ -49,6 +43,7 @@ static int zpci_set_airq(struct zpci_dev *zdev) fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? 
-EIO : 0; } @@ -60,6 +55,8 @@ static int zpci_clear_airq(struct zpci_dev *zdev) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev) fib.fmt = 1; fib.fmt1.noi = zdev->msi_nr_irqs; fib.fmt1.dibvo = zdev->msi_first_bit; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) u8 cc, status; fib.fmt = 1; + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -153,6 +152,7 @@ static struct irq_chip zpci_irq_chip = { static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + union zpci_sic_iib iib = {{0}}; unsigned long bit; int irqs_on = 0; @@ -164,7 +164,7 @@ static void zpci_handle_cpu_local_irq(bool rescan) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib)) break; bit = 0; continue; @@ -192,6 +192,7 @@ static void zpci_handle_remote_irq(void *data) static void zpci_handle_fallback_irq(void) { struct cpu_irq_data *cpu_data; + union zpci_sic_iib iib = {{0}}; unsigned long cpu; int irqs_on = 0; @@ -202,7 +203,7 @@ static void zpci_handle_fallback_irq(void) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; cpu = 0; continue; @@ -216,8 +217,11 @@ static void zpci_handle_fallback_irq(void) } } -static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_directed_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + bool floating = !tpi_info->directed_irq; + if (floating) { inc_irq_stat(IRQIO_PCF); zpci_handle_fallback_irq(); @@ -227,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) } } -static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_floating_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + union zpci_sic_iib iib = {{0}}; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -242,7 +248,7 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. 
*/ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; si = 0; continue; @@ -291,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) zdev->aisb = bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); if (!zdev->aibv) return -ENOMEM; @@ -402,11 +408,12 @@ static struct airq_struct zpci_airq = { static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; + union zpci_sic_iib ziib = {{0}}; iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; - __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); - zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib); } static int __init zpci_directed_irq_init(void) @@ -414,14 +421,14 @@ static int __init zpci_directed_irq_init(void) union zpci_sic_iib iib = {{0}}; unsigned int cpu; - zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL); if (!zpci_sbv) return -ENOMEM; iib.diib.isc = PCI_ISC; iib.diib.nr_cpus = num_possible_cpus(); iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector); - __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), GFP_KERNEL); @@ -436,7 +443,7 @@ static int __init zpci_directed_irq_init(void) zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, AIRQ_IV_DATA | AIRQ_IV_CACHELINE | - (!cpu ? AIRQ_IV_ALLOC : 0)); + (!cpu ? AIRQ_IV_ALLOC : 0), NULL); if (!zpci_ibv[cpu]) return -ENOMEM; } @@ -453,7 +460,7 @@ static int __init zpci_floating_irq_init(void) if (!zpci_ibv) return -ENOMEM; - zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); if (!zpci_sbv) goto out_free; @@ -466,6 +473,7 @@ out_free: int __init zpci_irq_init(void) { + union zpci_sic_iib iib = {{0}}; int rc; irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; @@ -497,7 +505,7 @@ int __init zpci_irq_init(void) * Enable floating IRQs (with suppression after one IRQ). When using * directed IRQs this enables the fallback path. 
*/ - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib); return 0; out_airq: diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 530dd941d140..cb0aff5c0187 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -111,6 +111,7 @@ static struct facility_def facility_defs[] = { 193, /* bear enhancement facility */ 194, /* rdp enhancement facility */ 196, /* processor activity instrumentation facility */ + 197, /* processor activity instrumentation extension 1 */ -1 /* END */ } }, diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 30788894124f..f969410d0c90 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -693,9 +693,9 @@ void x86_pmu_disable_all(void) } } -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { - return static_call(x86_pmu_guest_get_msrs)(nr); + return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_GPL(perf_guest_get_msrs); @@ -2103,14 +2103,15 @@ static int __init init_hw_perf_events(void) } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return 0; + err = 0; + goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) - return 0; + goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -2219,6 +2220,8 @@ out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); +out_bad_pmu: + memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); @@ -2990,6 +2993,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { + if (!x86_pmu_initialized()) { + memset(cap, 0, sizeof(*cap)); + return; + } + cap->version = x86_pmu.version; /* * KVM doesn't support the hybrid PMU yet. @@ -3002,5 +3010,17 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; + cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); + +u64 perf_get_hw_event_config(int hw_event) +{ + int max = x86_pmu.max_events; + + if (hw_event < max) + return x86_pmu.event_map(array_index_nospec(hw_event, max)); + + return 0; +} +EXPORT_SYMBOL_GPL(perf_get_hw_event_config); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bd8b98857609..2db93498ff71 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/nmi.h> +#include <linux/kvm_host.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -2852,6 +2853,47 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +/* + * We may be running with guest PEBS events created by KVM, and the + * PEBS records are logged into the guest's DS and invisible to host. + * + * In the case of guest PEBS overflow, we only trigger a fake event + * to emulate the PEBS overflow PMI for guest PEBS counters in KVM. + * The guest will then vm-entry and check the guest DS area to read + * the guest PEBS records. + * + * The contents and other behavior of the guest event do not matter. 
+ */ +static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, + struct perf_sample_data *data) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask; + struct perf_event *event = NULL; + int bit; + + if (!unlikely(perf_guest_state())) + return; + + if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active || + !guest_pebs_idxs) + return; + + for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, + INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) { + event = cpuc->events[bit]; + if (!event->attr.precise_ip) + continue; + + perf_sample_data_init(data, 0, event->hw.last_period); + if (perf_event_overflow(event, data, regs)) + x86_pmu_stop(event, 0); + + /* Inject one fake event is enough. */ + break; + } +} + static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; @@ -2891,10 +2933,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - status &= ~cpuc->pebs_enabled; - else - status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); /* * PEBS overflow sets bit 62 in the global status register @@ -2903,6 +2942,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) u64 pebs_enabled = cpuc->pebs_enabled; handled++; + x86_pmu_handle_guest_pebs(regs, &data); x86_pmu.drain_pebs(regs, &data); status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; @@ -3930,40 +3970,98 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +/* + * Currently, the only caller of this function is the atomic_switch_perf_msrs(). + * The host perf conext helps to prepare the values of the real hardware for + * a set of msrs that need to be switched atomically in a vmx transaction. + * + * For example, the pseudocode needed to add a new msr should look like: + * + * arr[(*nr)++] = (struct perf_guest_switch_msr){ + * .msr = the hardware msr address, + * .host = the value the hardware has when it doesn't run a guest, + * .guest = the value the hardware has when it runs a guest, + * }; + * + * These values have nothing to do with the emulated values the guest sees + * when it uses {RD,WR}MSR, which should be handled by the KVM context, + * specifically in the intel_pmu_{get,set}_msr(). 
+ */ +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; + int global_ctrl, pebs_enable; + + *nr = 0; + global_ctrl = (*nr)++; + arr[global_ctrl] = (struct perf_guest_switch_msr){ + .msr = MSR_CORE_PERF_GLOBAL_CTRL, + .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + }; - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - arr[0].guest &= ~cpuc->pebs_enabled; - else - arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); - *nr = 1; + if (!x86_pmu.pebs) + return arr; - if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { - /* - * If PMU counter has PEBS enabled it is not enough to - * disable counter on a guest entry since PEBS memory - * write can overshoot guest entry and corrupt guest - * memory. Disabling PEBS solves the problem. - * - * Don't do this if the CPU already enforces it. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - *nr = 2; + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. + */ + if (x86_pmu.pebs_no_isolation) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled, + .guest = 0, + }; + return arr; + } + + if (!kvm_pmu || !x86_pmu.pebs_ept) + return arr; + + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_DS_AREA, + .host = (unsigned long)cpuc->ds, + .guest = kvm_pmu->ds_area, + }; + + if (x86_pmu.intel_cap.pebs_baseline) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_PEBS_DATA_CFG, + .host = cpuc->pebs_data_cfg, + .guest = kvm_pmu->pebs_data_cfg, + }; + } + + pebs_enable = (*nr)++; + arr[pebs_enable] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + }; + + if (arr[pebs_enable].host) { + /* Disable guest PEBS if host PEBS is enabled. */ + arr[pebs_enable].guest = 0; + } else { + /* Disable guest PEBS for cross-mapped PEBS counters. 
*/ + arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[global_ctrl].guest |= arr[pebs_enable].guest; } return arr; } -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; @@ -5650,6 +5748,7 @@ __init int intel_pmu_init(void) x86_pmu.events_mask_len = eax.split.mask_length; x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -5834,6 +5933,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; @@ -6138,6 +6238,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: + x86_pmu.pebs_ept = 1; pmem = true; fallthrough; case INTEL_FAM6_ICELAKE_L: @@ -6190,6 +6291,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; @@ -6235,6 +6337,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; @@ -6399,8 +6502,7 @@ __init int intel_pmu_init(void) x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. + * Check all LBR MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ca2f8bfe6ff1..ba3d24a6a4ec 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -828,7 +828,8 @@ struct x86_pmu { pebs_prec_dist :1, pebs_no_tlb :1, pebs_no_isolation :1, - pebs_block :1; + pebs_block :1, + pebs_ept :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; @@ -838,6 +839,7 @@ struct x86_pmu { u64 (*pebs_latency_data)(struct perf_event *event, u64 status); unsigned long large_pebs_flags; u64 rtm_abort_event; + u64 pebs_capable; /* * Intel LBR @@ -913,7 +915,7 @@ struct x86_pmu { /* * Intel host/guest support (KVM) */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, void *data); /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. 
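The guest_get_msrs() callback now takes an opaque data pointer in addition to the count, and hands back an array of {msr, host, guest} triples. A minimal user-space sketch of how a caller could consume such an array; the MSR numbers and values are made up, and a stub stands in for the real perf_guest_get_msrs():

#include <stdio.h>
#include <stdint.h>

/* Mirrors the field layout implied by the .msr/.host/.guest initializers above. */
struct perf_guest_switch_msr {
	unsigned int msr;
	uint64_t host, guest;
};

/* Stand-in for perf_guest_get_msrs(nr, data); real values come from the PMU driver. */
static struct perf_guest_switch_msr *fake_guest_get_msrs(int *nr, void *data)
{
	static struct perf_guest_switch_msr arr[] = {
		{ .msr = 0x38f, .host = 0xf, .guest = 0x3 },	/* a GLOBAL_CTRL-style entry */
		{ .msr = 0x3f1, .host = 0x1, .guest = 0x0 },	/* a PEBS_ENABLE-style entry */
	};

	(void)data;	/* a caller such as KVM would pass its struct kvm_pmu here */
	*nr = 2;
	return arr;
}

int main(void)
{
	struct perf_guest_switch_msr *msrs;
	int i, nr;

	msrs = fake_guest_get_msrs(&nr, NULL);
	for (i = 0; i < nr; i++) {
		if (msrs[i].host == msrs[i].guest)
			continue;	/* nothing to switch for this MSR */
		printf("MSR 0x%x: load 0x%llx at VM-entry, restore 0x%llx at VM-exit\n",
		       msrs[i].msr,
		       (unsigned long long)msrs[i].guest,
		       (unsigned long long)msrs[i].host);
	}
	return 0;
}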
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index db2d92fb44da..fb8b2c088681 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -46,7 +46,7 @@ static void hv_apic_icr_write(u32 low, u32 id) { u64 reg_val; - reg_val = SET_APIC_DEST_FIELD(id); + reg_val = SET_XAPIC_DEST_FIELD(id); reg_val = reg_val << 32; reg_val |= low; diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 92035eb3afee..68d213e83fcc 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -89,8 +89,8 @@ #define APIC_DM_EXTINT 0x00700 #define APIC_VECTOR_MASK 0x000FF #define APIC_ICR2 0x310 -#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) -#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define GET_XAPIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_XAPIC_DEST_FIELD(x) ((x) << 24) #define APIC_LVTT 0x320 #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5fe7f6c8a7a4..14ed039dff55 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -353,6 +353,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index da47f60a4650..51f777071584 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) @@ -87,7 +88,7 @@ KVM_X86_OP(deliver_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index fdfd8e06fee6..c17e3e96fc1d 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -12,7 +12,7 @@ BUILD_BUG_ON(1) * a NULL definition, for example if "static_call_cond()" will be used * at the call sites. 
*/ -KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(hw_event_available) KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9217bd6cf0d1..e8281d64a431 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -65,6 +65,9 @@ #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -126,7 +129,6 @@ #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) -#define UNMAPPED_GVA (~(gpa_t)0) #define INVALID_GPA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ @@ -505,6 +507,7 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_mask; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; @@ -520,6 +523,21 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 ds_area; + u64 pebs_enable; + u64 pebs_enable_mask; + u64 pebs_data_cfg; + u64 pebs_data_cfg_mask; + + /* + * If a guest counter is cross-mapped to host counter with different + * index, its PEBS capability will be temporarily disabled. + * + * The user should make sure that this mask is updated + * after disabling interrupts and before perf_guest_get_msrs(); + */ + u64 host_cross_mapped_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. @@ -644,7 +662,6 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ - bool apicv_active; bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; @@ -695,7 +712,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; - struct kvm_mmu_memory_cache mmu_gfn_array_cache; + struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* @@ -808,6 +825,7 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 mcg_ext_ctl; u64 *mce_banks; + u64 *mci_ctl2_banks; /* Cache MMIO info */ u64 mmio_gva; @@ -1111,11 +1129,6 @@ enum kvm_apicv_inhibit { APICV_INHIBIT_REASON_PIT_REINJ, /* - * AVIC is inhibited because the guest has x2apic in its CPUID. - */ - APICV_INHIBIT_REASON_X2APIC, - - /* * AVIC is disabled because SEV doesn't support it. */ APICV_INHIBIT_REASON_SEV, @@ -1222,8 +1235,13 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool triple_fault_event; + bool bus_lock_detection_enabled; bool enable_pmu; + + u32 notify_window; + u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1307,6 +1325,36 @@ struct kvm_arch { hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; #endif + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. 
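To make the sizing concern in the comment above concrete, a back-of-the-envelope sketch; every number here is assumed for illustration, with one 64-bit entry per vCPU ID standing in for a structure such as the IPI-virtualization PID table:

#include <stdio.h>

int main(void)
{
	unsigned int kvm_max_vcpu_ids = 4096;	/* assumed global ceiling (KVM_MAX_VCPU_IDS) */
	unsigned int vm_max_vcpu_ids  = 16;	/* what a small VM actually tells KVM it needs */
	unsigned int entry_size       = 8;	/* assume one 64-bit entry per vCPU ID */

	printf("sized by the global ceiling:      %u bytes\n", kvm_max_vcpu_ids * entry_size);
	printf("sized by kvm->arch.max_vcpu_ids:  %u bytes\n", vm_max_vcpu_ids * entry_size);
	return 0;
}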
+ */ + u32 max_vcpu_ids; + + bool disable_nx_huge_pages; + + /* + * Memory caches used to allocate shadow pages when performing eager + * page splitting. No need for a shadowed_info_cache since eager page + * splitting only allocates direct shadow pages. + * + * Protected by kvm->slots_lock. + */ + struct kvm_mmu_memory_cache split_shadow_page_cache; + struct kvm_mmu_memory_cache split_page_header_cache; + + /* + * Memory cache used to allocate pte_list_desc structs while splitting + * huge pages. In the worst case, to split one huge page, 512 + * pte_list_desc structs are needed to add each lower level leaf sptep + * to the rmap plus 1 to extend the parent_ptes rmap of the lower level + * page table. + * + * Protected by kvm->slots_lock. + */ +#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) + struct kvm_mmu_memory_cache split_desc_cache; }; struct kvm_vm_stat { @@ -1367,6 +1415,7 @@ struct kvm_vcpu_stat { u64 preemption_reported; u64 preemption_other; u64 guest_mode; + u64 notify_window_exits; }; struct x86_instruction_info; @@ -1407,6 +1456,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); @@ -1471,7 +1521,7 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); @@ -1485,7 +1535,7 @@ struct kvm_x86_ops { bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); - void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + void (*hwapic_isr_update)(int isr); bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); @@ -1495,7 +1545,7 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); @@ -1705,21 +1755,6 @@ extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); -/* control of guest tsc rate supported? */ -extern bool kvm_has_tsc_control; -/* maximum supported tsc_khz for guests */ -extern u32 kvm_max_guest_tsc_khz; -/* number of bits of the fractional part of the TSC scaling ratio */ -extern u8 kvm_tsc_scaling_ratio_frac_bits; -/* maximum allowed value of TSC scaling ratio */ -extern u64 kvm_max_tsc_scaling_ratio; -/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ -extern u64 kvm_default_tsc_scaling_ratio; -/* bus lock detection supported? 
*/ -extern bool kvm_has_bus_lock_exit; - -extern u64 kvm_mce_cap_supported; - /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context @@ -2060,6 +2095,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ - KVM_X86_QUIRK_FIX_HYPERCALL_INSN) + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1ac0f9bf4b90..182b2a1f71fe 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -231,6 +231,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -1019,6 +1025,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 34348ae41cdb..f6fc8dd51ef4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -222,6 +222,7 @@ struct x86_pmu_capability { int bit_width_fixed; unsigned int events_mask; int events_mask_len; + unsigned int pebs_ept :1; }; /* @@ -520,6 +521,7 @@ struct x86_pmu_lbr { }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern u64 perf_get_hw_event_config(int hw_event); extern void perf_check_microcode(void); extern void perf_clear_dirty_counters(void); extern int x86_perf_rdpmc_index(struct perf_event *event); @@ -529,15 +531,20 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) memset(cap, 0, sizeof(*cap)); } +static inline u64 perf_get_hw_event_config(int hw_event) +{ + return 0; +} + static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1b07fba11704..0361626841bc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -195,6 +195,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define X2APIC_MODE_SHIFT 30 +#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT) + #define LBR_CTL_ENABLE_MASK BIT_ULL(0) #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) @@ -253,12 +256,19 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, }; +#define 
AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(9, 0) + +/* + * For AVIC, the max index allowed for physical APIC ID + * table is 0xff (255). + */ +#define AVIC_MAX_PHYSICAL_ID 0XFEULL /* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. + * For x2AVIC, the max index allowed for physical APIC ID + * table is 0x1ff (511). */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff +#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6c343c6a1855..c371ef695fcc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -31,6 +31,7 @@ #define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING) #define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING) #define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING) +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS) #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) @@ -74,6 +75,12 @@ #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING) + +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. + */ +#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) @@ -158,6 +165,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -221,6 +229,10 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -269,6 +281,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + NOTIFY_WINDOW = 0x00004024, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -554,6 +567,11 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* + * Exit Qualifications for NOTIFY VM EXIT + */ +#define NOTIFY_VM_CONTEXT_INVALID BIT(0) + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index d9a74681a77d..c6a7eed03914 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -5,7 +5,7 @@ /* * Defines VMX CPU feature bits */ -#define NVMXINTS 3 /* N 32-bit words worth of info */ +#define NVMXINTS 5 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -43,6 +43,7 @@ #define 
VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */ #define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */ #define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */ +#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */ #define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ #define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */ @@ -84,5 +85,8 @@ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */ +/* Tertiary Processor-Based VM-Execution Controls, word 3 */ +#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ec53c9fa1da9..46de10a809ec 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -306,7 +306,8 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 struct kvm_pit_state2 { struct kvm_pit_channel_state channels[3]; @@ -325,6 +326,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +361,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; @@ -434,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..a5faf6d88f1b 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -91,6 +91,7 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +154,8 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 189d3a5e471a..a4347605ab00 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -275,7 +275,7 @@ void native_apic_icr_write(u32 low, u32 id) unsigned long flags; local_irq_save(flags); - 
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index d1fb874fbe64..2a6509e8c840 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -99,7 +99,7 @@ sendmask: static inline int __prepare_ICR2(unsigned int mask) { - return SET_APIC_DEST_FIELD(mask); + return SET_XAPIC_DEST_FIELD(mask); } static inline void __xapic_wait_icr_idle(void) diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index da696eb4821a..993697e71854 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -15,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -22,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1a3658f7e6d9..d4e48b4a438b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -236,8 +236,7 @@ again: raw_spin_unlock(&b->lock); /* A dummy token might be allocated and ultimately not used. */ - if (dummy) - kfree(dummy); + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index de6d44e07e34..75dcf7a72605 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -67,9 +67,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() + * to avoid false positives when processing guest CPUID input. + */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( - struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -77,9 +85,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( for (i = 0; i < nent; i++) { e = &entries[i]; - if (e->function == function && - (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)) + if (e->function != function) + continue; + + /* + * If the index isn't significant, use the first entry with a + * matching function. It's userspace's responsibilty to not + * provide "duplicate" entries in all cases. 
+ */ + if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) + return e; + + + /* + * Similarly, use the first matching entry if KVM is doing a + * lookup (as opposed to emulating CPUID) for a function that's + * architecturally defined as not having a significant index. + */ + if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { + /* + * Direct lookups from KVM should not diverge from what + * KVM defines internally (the architectural behavior). + */ + WARN_ON_ONCE(cpuid_function_is_indexed(function)); return e; + } } return NULL; @@ -96,7 +126,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -151,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) vcpu->arch.kvm_cpuid_base = 0; for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function, 0); + entry = kvm_find_cpuid_entry(vcpu, function); if (entry) { u32 signature[3]; @@ -177,7 +208,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v if (!base) return NULL; - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) @@ -200,7 +232,7 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) /* * Calculate guest's supported XCR0 taking into account guest CPUID data and - * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). 
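The helper documented above, whose body follows in the next hunk, boils down to one masking expression: the guest's CPUID.0xD.0 XCR0 bits intersected with what KVM supports. A self-contained sketch with assumed register values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumed example values, not taken from the patch. */
	uint32_t eax = 0x00000007;		/* guest CPUID.0xD.0:EAX (x87 | SSE | AVX) */
	uint32_t edx = 0x00000000;		/* guest CPUID.0xD.0:EDX (upper XCR0 bits) */
	uint64_t supported_xcr0 = 0x2e7;	/* what host and KVM together allow */

	uint64_t guest_xcr0 = (eax | ((uint64_t)edx << 32)) & supported_xcr0;
	printf("guest-supported XCR0 = 0x%llx\n", (unsigned long long)guest_xcr0);
	return 0;
}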
*/ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) { @@ -210,7 +242,7 @@ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) if (!best) return 0; - return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, @@ -219,7 +251,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); - best = cpuid_entry2_find(entries, nent, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -250,7 +282,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & @@ -285,7 +317,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -325,10 +357,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: @@ -868,7 +900,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 9: break; case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -877,30 +908,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } - perf_get_x86_pmu_capability(&cap); + eax.split.version_id = kvm_pmu_cap.version; + eax.split.num_counters = kvm_pmu_cap.num_counters_gp; + eax.split.bit_width = kvm_pmu_cap.bit_width_gp; + eax.split.mask_length = kvm_pmu_cap.events_mask_len; + edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; + edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; - /* - * The guest architecture pmu is only supported if the architecture - * pmu exists on the host and the module parameters allow it. 
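For reference, the eax.split fields filled in by the leaf 0xa hunk above pack into CPUID.0xA:EAX as four byte-wide fields (version, number of general-purpose counters, counter bit width, event mask length). A small sketch with assumed capability values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumed capabilities, packed per the cpuid10_eax byte layout. */
	uint32_t version = 2, num_counters = 8, bit_width = 48, mask_length = 8;
	uint32_t eax = version | (num_counters << 8) | (bit_width << 16) | (mask_length << 24);

	printf("CPUID.0xA:EAX = 0x%08x\n", eax);
	return 0;
}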
- */ - if (!cap.version || !enable_pmu) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = - min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); - edx.split.bit_width_fixed = cap.bit_width_fixed; - if (cap.version) + if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; - entry->ebx = cap.events_mask; + entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; @@ -923,8 +944,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xd: { - u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm(); - u64 permitted_xss = supported_xss; + u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; entry->ebx = xstate_required_size(permitted_xcr0, false); @@ -1313,12 +1334,20 @@ out_free: return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* @@ -1355,7 +1384,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; - basic = kvm_find_cpuid_entry(vcpu, 0, 0); + basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; @@ -1364,11 +1393,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) - class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) - class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else - class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; @@ -1386,7 +1415,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ - return kvm_find_cpuid_entry(vcpu, basic->eax, index); + return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, @@ -1396,7 +1425,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; - entry = kvm_find_cpuid_entry(vcpu, function, index); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { @@ -1425,7 +1454,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * exists. EDX can be copied from any existing index. 
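The KVM_CPUID_INDEX_NOT_SIGNIFICANT sentinel introduced earlier in this file only works because the comparison is done on a u64, so no index a guest can pass in ECX can ever match it. A small sketch of that property:

#include <stdio.h>
#include <stdint.h>

#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull

int main(void)
{
	uint32_t guest_index = 0xffffffffu;	/* worst case a guest can put in ECX */
	uint64_t sentinel = KVM_CPUID_INDEX_NOT_SIGNIFICANT;

	/* Bits 63:32 of the sentinel are set, so no zero-extended u32 can equal it. */
	printf("collides with a guest index? %s\n",
	       ((uint64_t)guest_index == sentinel) ? "yes" : "no");
	return 0;
}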
*/ if (function == 0xb || function == 0x1f) { - entry = kvm_find_cpuid_entry(vcpu, function, 1); + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8a770b481d9d..b1658c0de847 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); + u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; @@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); @@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } @@ -127,7 +129,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -138,18 +140,23 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_model(best->eax); } +static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) +{ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9240b3b7f8dd..cfed36aba2f7 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -48,7 +48,7 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) { - *val = kvm_tsc_scaling_ratio_frac_bits; + *val = kvm_caps.tsc_scaling_ratio_frac_bits; return 0; } @@ -66,7 +66,7 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f8382abe22ff..047c583596bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -244,6 +244,9 @@ enum 
x86_transfer_type { static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); @@ -253,6 +256,12 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; @@ -266,9 +275,10 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) static void writeback_registers(struct x86_emulate_ctxt *ctxt) { + unsigned long dirty = ctxt->regs_dirty; unsigned reg; - for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } @@ -615,7 +625,9 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { - WARN_ON(vec > 0x1f); + if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt)) + return X86EMUL_UNHANDLEABLE; + ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; @@ -1362,7 +1374,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - WARN_ON((mc->end + size) >= sizeof(mc->data)); + if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt)) + return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, &ctxt->exception); @@ -1687,16 +1700,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; - if (!seg_desc.p) { - err_vec = NP_VECTOR; - goto exception; - } - old_desc = seg_desc; - seg_desc.type |= 2; /* busy */ - ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, - sizeof(seg_desc), &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1734,8 +1737,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32), ctxt)) - return emulate_gp(ctxt, 0); + ((u64)base3 << 32), ctxt)) + return emulate_gp(ctxt, err_code); + } + + if (seg == VCPU_SREG_TR) { + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); @@ -2432,7 +2444,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < 8; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2489,7 +2501,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for 
(i = 0; i < 16; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); @@ -5719,7 +5731,8 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) { - WARN_ON(ctxt->exception.vector > 0x1f); + if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt)) + return EMULATION_FAILED; ctxt->have_exception = true; } if (rc == X86EMUL_INTERCEPTED) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e2e95a6fccfd..ed804447589c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1992,7 +1992,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; struct kvm_vcpu_hv *hv_vcpu; - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; } else { @@ -2005,7 +2005,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu = to_hv_vcpu(vcpu); - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; @@ -2016,7 +2016,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.features_edx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; @@ -2025,7 +2025,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.enlightenments_ebx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; else diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1c83076091af..e0a7a0e7a73c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, return -EOPNOTSUPP; mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; + if (val & (1 << 1)) + pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON; + else + pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON; pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; @@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); + ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) | + pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | + (refresh_clock << 4); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 394d9527da7e..a768212ba821 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -29,7 +29,6 @@ struct kvm_kpit_state { bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - u32 speaker_data_on; struct mutex lock; atomic_t reinject; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 8dff25d267b7..89246446d6aa 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -89,6 
+89,7 @@ struct x86_instruction_info { #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { + void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* * read_gpr: read a general purpose register (rax - r15) * @@ -301,6 +302,18 @@ struct fastop; typedef void (*fastop_t)(struct fastop *); +/* + * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is + * tracked/accessed via _eip, and except for RIP relative addressing, which + * also uses _eip, RIP cannot be a register operand nor can it be an operand in + * a ModRM or SIB byte. + */ +#ifdef CONFIG_X86_64 +#define NR_EMULATOR_GPRS 16 +#else +#define NR_EMULATOR_GPRS 8 +#endif + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -345,9 +358,9 @@ struct x86_emulate_ctxt { u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_valid; + u16 regs_valid; /* bitmaps of registers in _regs[] that have been written */ - u32 regs_dirty; + u16 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; @@ -363,7 +376,7 @@ struct x86_emulate_ctxt { struct operand src2; struct operand dst; struct operand memop; - unsigned long _regs[NR_VCPU_REGS]; + unsigned long _regs[NR_EMULATOR_GPRS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; @@ -371,6 +384,15 @@ struct x86_emulate_ctxt { bool is_branch; }; +#define KVM_EMULATOR_BUG_ON(cond, ctxt) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret)) \ + ctxt->ops->vm_bugged(ctxt); \ + unlikely(__ret); \ +}) + /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0e68b4c937fc..e2ce3556915e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -27,6 +27,7 @@ #include <linux/math64.h> #include <linux/slab.h> #include <asm/processor.h> +#include <asm/mce.h> #include <asm/msr.h> #include <asm/page.h> #include <asm/current.h> @@ -54,7 +55,7 @@ #define PRIo64 "o" /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define MAX_APIC_VECTOR 256 @@ -67,6 +68,8 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { @@ -398,14 +401,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) +{ + return apic->nr_lvt_entries > lvt_index; +} + +static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu) +{ + return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P); +} + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 v = APIC_VERSION; + u32 v = 0; if (!lapic_in_kernel(vcpu)) return; + v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); + /* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. 
Windows with @@ -419,12 +434,33 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { - LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ - LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ - LVT_MASK | APIC_MODE_MASK, /* LVTPC */ - LINT_MASK, LINT_MASK, /* LVT0-1 */ - LVT_MASK /* LVTERR */ +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) +{ + int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) + return; + + /* Initialize/mask any "new" LVT entries. */ + for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); + + apic->nr_lvt_entries = nr_lvt_entries; + + /* The number of LVT entries is reflected in the version register. */ + kvm_apic_set_version(vcpu); +} + +static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { + [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ + [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, + [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, + [LVT_LINT0] = LINT_MASK, + [LVT_LINT1] = LINT_MASK, + [LVT_ERROR] = LVT_MASK, + [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; static int find_highest_vector(void *bitmap) @@ -518,14 +554,11 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - - vcpu = apic->vcpu; - - if (unlikely(vcpu->arch.apicv_active)) { + if (unlikely(apic->apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -542,20 +575,16 @@ EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -589,12 +618,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need @@ -602,8 +628,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. 
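With the LVT count now variable, the version register logic above encodes nr_lvt_entries minus one in the Max LVT Entry field (nr_lvt_entries is 6 without the CMCI entry and 7 with it, per KVM_APIC_MAX_NR_LVT_ENTRIES). A quick sketch of the resulting LVR values:

#include <stdio.h>

#define APIC_VERSION 0x14UL	/* same base value the patch keeps */

int main(void)
{
	for (int nr_lvt_entries = 6; nr_lvt_entries <= 7; nr_lvt_entries++) {
		unsigned long lvr = APIC_VERSION |
				    ((unsigned long)(nr_lvt_entries - 1) << 16);
		printf("nr_lvt_entries=%d -> LVR=0x%06lx (Max LVT Entry=%lu)\n",
		       nr_lvt_entries, lvr, (lvr >> 16) & 0xff);
	}
	return 0;
}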
*/ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -801,17 +827,17 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (kvm_apic_broadcast(apic, mda)) return true; - if (apic_x2apic_mode(apic)) - return mda == kvm_x2apic_id(apic); - /* - * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if - * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and - * this allows unique addressing of VCPUs with APIC ID over 0xff. - * The 0xff condition is needed because writeable xAPIC ID. + * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they + * were in x2APIC mode if the target APIC ID can't be encoded as an + * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which + * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC + * mode. Match the x2APIC ID if and only if the target APIC ID can't + * be encoded in xAPIC to avoid spurious matches against a vCPU that + * changed its (addressable) xAPIC ID (which is writable). */ - if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) - return true; + if (apic_x2apic_mode(apic) || mda > 0xff) + return mda == kvm_x2apic_id(apic); return mda == kvm_xapic_id(apic); } @@ -1325,7 +1351,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); trace_kvm_apic_ipi(icr_low, irq.dest_id); @@ -1444,6 +1470,9 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); + if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); + /* * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be @@ -1583,7 +1612,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; - if (vcpu->arch.apicv_active) + if (apic->apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) @@ -1602,7 +1631,7 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware. 
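/*
 * Editor's illustration (not part of this commit): the rework of
 * kvm_apic_match_physical_addr() above reduces the physical-destination check
 * to a single predicate.  This sketch takes the target's mode and IDs as plain
 * parameters instead of reading the emulated LAPIC; broadcast destinations are
 * assumed to have been handled earlier, as in the kernel code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool match_physical_dest(bool target_in_x2apic, uint32_t x2apic_id,
                                uint32_t xapic_id, uint32_t mda)
{
        /*
         * If the target is in x2APIC mode, or the destination can't be
         * encoded as an 8-bit xAPIC ID, only the x2APIC ID may match;
         * otherwise compare against the (writable) xAPIC ID.
         */
        if (target_in_x2apic || mda > 0xff)
                return mda == x2apic_id;

        return mda == xapic_id;
}

int main(void)
{
        /* A hotplugged vCPU with APIC ID 0x123 is reachable only via its x2APIC ID. */
        printf("%d %d\n", match_physical_dest(false, 0x123, 0x23, 0x123),
                          match_physical_dest(false, 0x123, 0x23, 0x23));
        return 0;
}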
*/ - if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) { __delay(min(guest_cycles, nsec_to_cycles(vcpu, timer_advance_ns))); } else { @@ -1700,7 +1729,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; - if (!from_timer_fn && vcpu->arch.apicv_active) { + if (!from_timer_fn && apic->apicv_active) { WARN_ON(kvm_get_running_vcpu() != vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; @@ -2052,6 +2081,16 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); } +static int get_lvt_index(u32 reg) +{ + if (reg == APIC_LVTCMCI) + return LVT_CMCI; + if (reg < APIC_LVTT || reg > APIC_LVTERR) + return -1; + return array_index_nospec( + (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES); +} + static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -2098,13 +2137,10 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; - u32 lvt_val; - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { - lvt_val = kvm_lapic_get_reg(apic, - APIC_LVTT + 0x10 * i); - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, - lvt_val | APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) { + kvm_lapic_set_reg(apic, APIC_LVTx(i), + kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED); } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2133,16 +2169,15 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: { - /* TODO: Check vector */ - size_t size; - u32 index; - + case APIC_LVTERR: + case APIC_LVTCMCI: { + u32 index = get_lvt_index(reg); + if (!kvm_lapic_lvt_supported(apic, index)) { + ret = 1; + break; + } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - size = ARRAY_SIZE(apic_lvt_mask); - index = array_index_nospec( - (reg - APIC_LVTT) >> 4, size); val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); break; @@ -2246,10 +2281,26 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val; + + if (apic_x2apic_mode(apic)) + kvm_lapic_msr_read(apic, offset, &val); + else + val = kvm_lapic_get_reg(apic, offset); - /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + /* + * ICR is a single 64-bit register when x2APIC is enabled. For legacy + * xAPIC, ICR writes need to go down the common (slightly slower) path + * to get the upper half from ICR2. 
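/*
 * Editor's illustration (not part of this commit): in x2APIC mode the ICR is a
 * single 64-bit MSR, while the IPI path still wants the legacy low/high
 * halves, hence the (u32)val / (u32)(val >> 32) split in the trap handler
 * above.  A minimal sketch of that split; the struct is a placeholder, not a
 * kernel type.
 */
#include <stdint.h>
#include <stdio.h>

struct icr_halves {
        uint32_t icr_low;   /* vector, delivery mode, level, shorthand, ...  */
        uint32_t icr_high;  /* destination: full 32-bit APIC ID in x2APIC    */
};

static struct icr_halves split_x2apic_icr(uint64_t icr)
{
        return (struct icr_halves){
                .icr_low  = (uint32_t)icr,
                .icr_high = (uint32_t)(icr >> 32),
        };
}

int main(void)
{
        struct icr_halves h = split_x2apic_icr(0x000000fe000040fdULL);

        printf("low %#x high %#x\n", h.icr_low, h.icr_high);
        return 0;
}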
+ */ + if (apic_x2apic_mode(apic) && offset == APIC_ICR) { + kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); + trace_kvm_apic_write(APIC_ICR, val); + } else { + /* TODO: optimize to just emulate side effect w/o one more write */ + kvm_lapic_reg_write(apic, offset, (u32)val); + } } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -2344,8 +2395,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); - if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) + if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { + kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2361,7 +2414,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { /* irr_pending is always true when apicv is activated. */ apic->irr_pending = true; apic->isr_count = 1; @@ -2401,8 +2454,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_LVT_NUM; i++) - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) @@ -2436,10 +2489,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(-1); } vcpu->arch.apic_arb_prio = 0; @@ -2532,6 +2585,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) } apic->vcpu = vcpu; + apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; @@ -2716,10 +2771,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 65bb2a8cf145..117a46df5cc1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,7 +10,6 @@ #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 -#define KVM_APIC_LVT_NUM 6 #define APIC_SHORT_MASK 0xc0000 #define APIC_DEST_NOSHORT 0x0 @@ -29,6 +28,20 @@ enum lapic_mode { LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, }; +enum lapic_lvt_entry { + LVT_TIMER, + LVT_THERMAL_MONITOR, + LVT_PERFORMANCE_COUNTER, + LVT_LINT0, + LVT_LINT1, + LVT_ERROR, + LVT_CMCI, + + 
KVM_APIC_MAX_NR_LVT_ENTRIES, +}; + +#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x)) + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -48,6 +61,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool apicv_active; bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; @@ -65,6 +79,7 @@ struct kvm_lapic { struct gfn_to_hva_cache vapic_cache; unsigned long pending_events; unsigned int sipi_vector; + int nr_lvt_entries; }; struct dest_map; @@ -84,6 +99,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); @@ -204,7 +220,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && vcpu->arch.apicv_active; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f8192864b496..a99acec925eb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -6,11 +6,6 @@ #include "kvm_cache_regs.h" #include "cpuid.h" -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 @@ -34,11 +29,6 @@ #define PT_DIR_PAT_SHIFT 12 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - #define PT64_ROOT_5LEVEL 5 #define PT64_ROOT_4LEVEL 4 #define PT32_ROOT_LEVEL 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..4236a28b9be5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -53,8 +53,6 @@ #include <asm/kvm_page_track.h> #include "trace.h" -#include "paging.h" - extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; @@ -111,26 +109,6 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) -#define PT32_LVL_ADDR_MASK(level) \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ @@ -326,13 +304,6 @@ static int is_cpuid_PSE36(void) return 1; } -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { @@ -432,7 +403,7 @@ static u64 __update_clear_spte_slow(u64 
*sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * An spte tlb flush may be pending, because kvm_set_pte_rmap * coalesces them and we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * @@ -558,11 +529,12 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) +static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; + struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -578,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by - * kvm mmu, before reclaiming the page, we should - * unmap it from mmu first. + * KVM doesn't hold a reference to any pages mapped into the guest, and + * instead uses the mmu_notifier to ensure that KVM unmaps any pages + * before they are reclaimed. Sanity check that, if the pfn is backed + * by a refcounted page, the refcount is elevated. */ - WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); + page = kvm_pfn_to_refcounted_page(pfn); + WARN_ON(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -682,7 +656,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) if (r) return r; if (maybe_indirect) { - r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; @@ -695,48 +669,79 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) -{ - return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); -} - static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } +static bool sp_has_gptes(struct kvm_mmu_page *sp); + static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (!sp->role.direct) - return sp->gfns[index]; + return sp->shadowed_translation[index] >> PAGE_SHIFT; - return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); + return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +/* + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note + * that the SPTE itself may have a more constrained access permissions that + * what the guest enforces. For example, a guest may create an executable + * huge PTE but KVM may disallow execution to mitigate iTLB multihit. 
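/*
 * Editor's illustration (not part of this commit): the shadowed_translation
 * array introduced above packs the shadowed gfn and the shadowed guest access
 * bits into a single 64-bit word, which works because (gfn << PAGE_SHIFT)
 * leaves the low 12 bits free and ACC_ALL is only 3 bits wide.  The packing
 * below mirrors kvm_mmu_page_set_translation()/_get_gfn()/_get_access();
 * PAGE_SHIFT and ACC_ALL use the kernel's values, the rest is illustrative.
 */
#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define ACC_ALL    0x7u     /* exec | write | user */

static uint64_t pack(uint64_t gfn, unsigned int access)
{
        return (gfn << PAGE_SHIFT) | access;
}

static uint64_t     unpack_gfn(uint64_t e)    { return e >> PAGE_SHIFT; }
static unsigned int unpack_access(uint64_t e) { return e & ACC_ALL; }

int main(void)
{
        uint64_t e = pack(0x1234, 0x3);

        assert(unpack_gfn(e) == 0x1234 && unpack_access(e) == 0x3);
        return 0;
}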
+ */ +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp->role.passthrough) { - WARN_ON_ONCE(gfn != sp->gfn); - return; - } + if (sp_has_gptes(sp)) + return sp->shadowed_translation[index] & ACC_ALL; - if (!sp->role.direct) { - sp->gfns[index] = gfn; + /* + * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, + * KVM is not shadowing any guest page tables, so the "guest access + * permissions" are just ACC_ALL. + * + * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM + * is shadowing a guest huge page with small pages, the guest access + * permissions being shadowed are the access permissions of the huge + * page. + * + * In both cases, sp->role.access contains the correct access bits. + */ + return sp->role.access; +} + +static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, + gfn_t gfn, unsigned int access) +{ + if (sp_has_gptes(sp)) { + sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } - if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) - pr_err_ratelimited("gfn mismatch under direct page %llx " - "(expected %llx, got %llx)\n", - sp->gfn, - kvm_mmu_page_get_gfn(sp, index), gfn); + WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), + "access mismatch under %s page %llx (expected %u, got %u)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_access(sp, index), access); + + WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), + "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); +} + +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, + unsigned int access) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); + + kvm_mmu_page_set_translation(sp, index, gfn, access); } /* @@ -792,6 +797,9 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) KVM_PAGE_TRACK_WRITE); kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -855,7 +863,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Returns the number of pointers in the rmap chain, not counting the new one. 
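/*
 * Editor's illustration (not part of this commit): the rmap head that
 * pte_list_add() below manipulates encodes two cases in one word -- a single
 * SPTE pointer when there is exactly one mapping (no allocation needed), or a
 * pointer to a pte_list_desc with bit 0 set as a tag when a list is required.
 * These helpers only decode that tag; the descriptor layout itself is left
 * out.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool rmap_is_empty(unsigned long val)  { return val == 0; }
static bool rmap_is_single(unsigned long val) { return val && !(val & 1); }
static bool rmap_is_list(unsigned long val)   { return val & 1; }

/* For the list case, strip the tag bit before dereferencing. */
static void *rmap_desc(unsigned long val)     { return (void *)(val & ~1ul); }

int main(void)
{
        uint64_t spte;
        unsigned long one  = (unsigned long)&spte;       /* single SPTE      */
        unsigned long many = (unsigned long)&spte | 1;   /* tagged desc ptr  */

        printf("empty=%d single=%d list=%d desc=%p\n", rmap_is_empty(0),
               rmap_is_single(one), rmap_is_list(many), rmap_desc(many));
        return 0;
}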
*/ -static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, +static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; @@ -866,7 +874,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { rmap_printk("%p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_pte_list_desc(vcpu); + desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; @@ -878,7 +886,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; if (!desc->more) { - desc->more = mmu_alloc_pte_list_desc(vcpu); + desc->more = kvm_mmu_memory_cache_alloc(cache); desc = desc->more; desc->spte_count = 0; break; @@ -913,7 +921,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; @@ -949,15 +957,16 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - u64 *sptep) +static void kvm_zap_one_rmap_spte(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - __pte_list_remove(sptep, rmap_head); + pte_list_remove(sptep, rmap_head); } -/* Return true if rmap existed, false otherwise */ -static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +/* Return true if at least one SPTE was zapped, false otherwise */ +static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1030,7 +1039,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU @@ -1042,7 +1051,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - __pte_list_remove(spte, rmap_head); + pte_list_remove(spte, rmap_head); } /* @@ -1129,26 +1138,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } - -static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { - if (is_large_pte(*sptep)) { - WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); - drop_spte(kvm, sptep); - return true; - } + struct kvm_mmu_page *sp; - return false; -} + sp = sptep_to_sp(sptep); + WARN_ON(sp->role.level == PG_LEVEL_4K); -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = sptep_to_sp(sptep); + drop_spte(kvm, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); - } } /* @@ -1383,22 +1384,22 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct 
kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) +static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { - return pte_list_destroy(kvm, rmap_head); + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head, slot); + return __kvm_zap_rmap(kvm, rmap_head, slot); } -static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) +static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; @@ -1417,7 +1418,7 @@ restart: need_flush = true; if (pte_write(pte)) { - pte_list_remove(kvm, rmap_head, sptep); + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( @@ -1529,7 +1530,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); if (is_tdp_mmu_enabled(kvm)) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1542,7 +1543,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1550,9 +1551,9 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1564,9 +1565,9 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return young; } -static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) +static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1579,31 +1580,43 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) +static void __rmap_add(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); + kvm_update_page_stats(kvm, sp->role.level, 1); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); 
- rmap_count = pte_list_add(vcpu, spte, rmap_head); + rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_with_address( - vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } } +static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) +{ + struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; + + __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1616,7 +1629,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -1652,14 +1665,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); if (!sp->role.direct) - free_page((unsigned long)sp->gfns); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1668,19 +1681,19 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, +static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(vcpu, parent_pte, &sp->parent_ptes); + pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - __pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -1690,27 +1703,6 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) -{ - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - if (!direct) - sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() - * depends on valid pages being added to the head of the list. See - * comments in kvm_zap_obsolete_pages(). 
- */ - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { @@ -1725,11 +1717,9 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; - unsigned int index; sp = sptep_to_sp(spte); - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; @@ -1789,7 +1779,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2019,36 +2009,24 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int direct, - unsigned int access) +/* + * The vCPU is required when finding indirect shadow pages; the shadow + * page may already exist and syncing it needs the vCPU pointer in + * order to read guest page tables. Direct shadow pages are never + * unsync, thus @vcpu can be NULL if @role.direct is true. + */ +static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) { - bool direct_mmu = vcpu->arch.mmu->root_role.direct; - union kvm_mmu_page_role role; - struct hlist_head *sp_list; - unsigned quadrant; struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu->root_role; - role.level = level; - role.direct = direct; - role.access = access; - if (role.has_4_byte_gpte) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - if (level <= vcpu->arch.mmu->cpu_role.base.level) - role.passthrough = 0; - - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; - for_each_valid_sp(vcpu->kvm, sp, sp_list) { + for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2064,16 +2042,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ - if (level > PG_LEVEL_4K && sp->unsync) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + if (role.level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } - if (direct_mmu) - goto trace_get_page; + /* unsync and write-flooding only apply to indirect SPs. */ + if (sp->role.direct) + goto out; if (sp->unsync) { + if (KVM_BUG_ON(!vcpu, kvm)) + break; + /* * The page is good, but is stale. 
kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) @@ -2092,37 +2074,160 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, WARN_ON(!list_empty(&invalid_list)); if (ret > 0) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); -trace_get_page: - trace_kvm_mmu_get_page(sp, false); goto out; } - ++vcpu->kvm->stat.mmu_cache_miss; + sp = NULL; + ++kvm->stat.mmu_cache_miss; + +out: + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (collisions > kvm->stat.max_mmu_page_hash_collisions) + kvm->stat.max_mmu_page_hash_collisions = collisions; + return sp; +} + +/* Caches used when allocating a new shadow page. */ +struct shadow_page_caches { + struct kvm_mmu_memory_cache *page_header_cache; + struct kvm_mmu_memory_cache *shadow_page_cache; + struct kvm_mmu_memory_cache *shadowed_info_cache; +}; + +static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, + struct shadow_page_caches *caches, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); + if (!role.direct) + sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); + + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - sp = kvm_mmu_alloc_page(vcpu, direct); + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ + sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; + list_add(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mod_used_mmu_pages(kvm, +1); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); - if (sp_has_gptes(sp)) { - account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); + if (sp_has_gptes(sp)) + account_shadowed(kvm, sp); + + return sp; +} + +/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. 
*/ +static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + struct shadow_page_caches *caches, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct hlist_head *sp_list; + struct kvm_mmu_page *sp; + bool created = false; + + sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + + sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); + if (!sp) { + created = true; + sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } - trace_kvm_mmu_get_page(sp, true); -out: - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) - vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + trace_kvm_mmu_get_page(sp, created); return sp; } +static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct shadow_page_caches caches = { + .page_header_cache = &vcpu->arch.mmu_page_header_cache, + .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, + .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, + }; + + return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); +} + +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, + unsigned int access) +{ + struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); + union kvm_mmu_page_role role; + + role = parent_sp->role; + role.level--; + role.access = access; + role.direct = direct; + role.passthrough = 0; + + /* + * If the guest has 4-byte PTEs then that means it's using 32-bit, + * 2-level, non-PAE paging. KVM shadows such guests with PAE paging + * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must + * shadow each guest page table with multiple shadow page tables, which + * requires extra bookkeeping in the role. + * + * Specifically, to shadow the guest's page directory (which covers a + * 4GiB address space), KVM uses 4 PAE page directories, each mapping + * 1GiB of the address space. @role.quadrant encodes which quarter of + * the address space each maps. + * + * To shadow the guest's page tables (which each map a 4MiB region), KVM + * uses 2 PAE page tables, each mapping a 2MiB region. For these, + * @role.quadrant encodes which half of the region they map. + * + * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE + * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow + * PDPTEs; those 4 PAE page directories are pre-allocated and their + * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes + * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume + * bit 21 in the PTE (the child here), KVM propagates that bit to the + * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE + * covers bit 21 (see above), thus the quadrant is calculated from the + * _least_ significant bit of the PDE index. 
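/*
 * Editor's illustration (not part of this commit): a worked example of the
 * quadrant encoding described in the comment above, for a 32-bit non-PAE
 * guest shadowed with PAE paging.  A guest page table is indexed by
 * addr[21:12] (10 bits) but a PAE shadow page table only by addr[20:12]
 * (9 bits), so addr[21] picks which of two shadow pages ("quadrant" 0/1)
 * shadows it; likewise addr[31:30] picks one of the four pre-allocated PAE
 * page directories.  The address below is arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int pt_quadrant(uint32_t addr) { return (addr >> 21) & 1; }
static unsigned int pd_quadrant(uint32_t addr) { return (addr >> 30) & 3; }

int main(void)
{
        uint32_t addr = 0xc0a00000;

        printf("addr %#x -> page-directory quadrant %u, page-table quadrant %u\n",
               addr, pd_quadrant(addr), pt_quadrant(addr));
        return 0;
}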
+ */ + if (role.has_4_byte_gpte) { + WARN_ON_ONCE(role.level != PG_LEVEL_4K); + role.quadrant = spte_index(sptep) & 1; + } + + return role; +} + +static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) +{ + union kvm_mmu_page_role role; + + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return ERR_PTR(-EEXIST); + + role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); +} + static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) @@ -2145,7 +2250,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; - iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; @@ -2164,7 +2269,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) if (iterator->level < PG_LEVEL_4K) return false; - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } @@ -2177,7 +2282,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, return; } - iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; + iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } @@ -2186,23 +2291,38 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, - struct kvm_mmu_page *sp) +static void __link_shadow_page(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, u64 *sptep, + struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + /* + * If an SPTE is present already, it must be a leaf and therefore + * a large one. Drop it, and flush the TLB if needed, before + * installing sp. + */ + if (is_shadow_present_pte(*sptep)) + drop_large_spte(kvm, sptep, flush); + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(vcpu, sp, sptep); + mmu_page_add_parent_pte(cache, sp, sptep); if (sp->unsync_children || sp->unsync) mark_unsync(sptep); } +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) +{ + __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); +} + static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2216,7 +2336,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. 
*/ - child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2237,7 +2357,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, spte); /* @@ -2263,7 +2383,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, int zapped = 0; unsigned i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; @@ -2396,7 +2516,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(sp); + kvm_mmu_free_shadow_page(sp); } } @@ -2676,7 +2796,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -2711,8 +2831,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); - kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_add(vcpu, slot, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn, pte_access); + } else { + /* Already rmapped but the pte_access bits may have changed. */ + kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; @@ -2728,7 +2850,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, int i, ret; gfn_t gfn; - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return -1; @@ -2754,7 +2876,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, WARN_ON(!sp->role.direct); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -2798,20 +2920,42 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, +/* + * Lookup the mapping level for @gfn in the current mm. + * + * WARNING! Use of host_pfn_mapping_level() requires the caller and the end + * consumer to be tied into KVM's handlers for MMU notifier events! + * + * There are several ways to safely use this helper: + * + * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * consuming it. In this case, mmu_lock doesn't need to be held during the + * lookup, but it does need to be held while checking the MMU notifier. + * + * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation + * event for the hva. This can be done by explicit checking the MMU notifier + * or by ensuring that KVM already has a valid mapping that covers the hva. + * + * - Do not use the result to install new mappings, e.g. use the host mapping + * level only to decide whether or not to zap an entry. 
In this case, it's + * not required to hold mmu_lock (though it's highly likely the caller will + * want to hold mmu_lock anyways, e.g. to modify SPTEs). + * + * Note! The lookup can still race with modifications to host page tables, but + * the above "rules" ensure KVM will not _consume_ the result of the walk if a + * race with the primary MMU occurs. + */ +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { + int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; - int level = PG_LEVEL_4K; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PG_LEVEL_4K; - /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the @@ -2823,16 +2967,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, hva = __gfn_to_hva_memslot(slot, gfn); /* - * Lookup the mapping level in the current mm. The information - * may become stale soon, but it is safe to use as long as - * 1) mmu_notifier_retry was checked after taking mmu_lock, and - * 2) mmu_lock is taken now. - * - * We still need to disable IRQs to prevent concurrent tear down - * of page tables. + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. */ local_irq_save(flags); + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_large() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; @@ -2864,7 +3011,7 @@ out: int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level) + int max_level) { struct kvm_lpage_info *linfo; int host_level; @@ -2879,7 +3026,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } @@ -2893,7 +3040,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -2904,8 +3051,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * level, which will be used to do precise, accurate accounting. 
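/*
 * Editor's illustration (not part of this commit): the rules above amount to
 * "the looked-up level is only a hint until it has been revalidated against
 * the MMU notifier".  A stripped-down sketch of the first pattern (lockless
 * lookup, then check mmu_notifier_retry_hva() under mmu_lock); the sequence
 * counter, the retry check and the mapping step are all stand-ins, not KVM
 * APIs.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long fake_mmu_seq;   /* bumped by a pretend invalidation     */

static unsigned long read_mmu_seq(void)          { return fake_mmu_seq; }
static bool notifier_retry(unsigned long seq)    { return seq != fake_mmu_seq; }
static int  lookup_host_level(unsigned long hva) { (void)hva; return 2; /* 2MiB */ }

static void map_with_host_level(unsigned long hva)
{
        unsigned long seq = read_mmu_seq();
        int level = lookup_host_level(hva);  /* lockless: result may go stale */

        /* ... take mmu_lock ... */
        if (notifier_retry(seq)) {           /* invalidation ran: discard hint */
                /* ... drop mmu_lock and retry the fault ... */
                return;
        }
        printf("mapping hva %#lx at level %d\n", hva, level);
        /* ... drop mmu_lock ... */
}

int main(void)
{
        map_with_host_level(0x7f0000000000ul);
        return 0;
}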
*/ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->pfn, - fault->max_level); + fault->gfn, fault->max_level); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -2961,13 +3107,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (it.level == fault->goal_level) break; - drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); + if (sp == ERR_PTR(-EEXIST)) continue; - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, it.sptep, sp); if (fault->is_tdp && fault->huge_page_disallowed && fault->req_level >= it.level) @@ -3095,7 +3238,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) @@ -3265,7 +3408,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); if (WARN_ON(!sp)) return; @@ -3369,12 +3512,19 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, - u8 level, bool direct) +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, + u8 level) { + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; - sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + role.level = level; + role.quadrant = quadrant; + + WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); + WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); + + sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); @@ -3397,7 +3547,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { @@ -3408,8 +3558,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, true); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, + PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } @@ -3493,9 +3643,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; + int quadrant, i, r; hpa_t root; - unsigned i; - int r; root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; @@ -3533,7 +3682,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - mmu->root_role.level, false); + mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } @@ -3578,8 +3727,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_gfn = pdptrs[i] >> PAGE_SHIFT; } - 
root = mmu_alloc_root(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, false); + /* + * If shadowing 32-bit non-PAE page tables, each PAE page + * directory maps one quarter of the guest's non-PAE page + * directory. Othwerise each PAE page direct shadows one guest + * PAE page directory so that quadrant should be 0. + */ + quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; + + root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } @@ -3737,7 +3893,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; + root &= SPTE_BASE_ADDR_MASK; sp = to_shadow_page(root); mmu_sync_children(vcpu, sp, true); } @@ -4155,14 +4311,26 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - while (fault->max_level > PG_LEVEL_4K) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; + /* + * If the guest's MTRRs may be used to compute the "real" memtype, + * restrict the mapping level to ensure KVM uses a consistent memtype + * across the entire mapping. If the host MTRRs are ignored by TDP + * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA + * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype + * from the guest's MTRRs so that guest accesses to memory that is + * DMA'd aren't cached against the guest's wishes. + * + * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, + * e.g. KVM will force UC memtype for host MMIO. + */ + if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - --fault->max_level; + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; + } } return direct_page_fault(vcpu, fault); @@ -4567,7 +4735,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->root_role.level, false, + context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else @@ -5199,11 +5367,11 @@ static bool need_remote_flush(u64 old, u64 new) return false; if (!is_shadow_present_pte(new)) return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) + if ((old ^ new) & SPTE_BASE_ADDR_MASK) return true; old ^= shadow_nx_mask; new ^= shadow_nx_mask; - return (old & ~new & PT64_PERM_MASK) != 0; + return (old & ~new & SPTE_PERM_MASK) != 0; } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, @@ -5328,13 +5496,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skipped if it does not have - * enough objects in the cache. 
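/*
 * Editor's illustration (not part of this commit): the MTRR-consistency loop
 * above needs the gfn of the aligned huge page that would contain the fault
 * at each candidate level.  The alignment arithmetic, using the usual x86
 * values (4KiB base pages, 9 index bits per paging level); the sample GPA is
 * arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static uint64_t pages_per_hpage(int level)       /* 1, 512, 262144, ...      */
{
        return 1ULL << ((level - 1) * 9);
}

static uint64_t hpage_base_gfn(uint64_t gpa, int level)
{
        return (gpa >> PAGE_SHIFT) & ~(pages_per_hpage(level) - 1);
}

int main(void)
{
        uint64_t gpa = 0x12345678;

        printf("gfn %#llx: 2MiB base %#llx, 1GiB base %#llx\n",
               (unsigned long long)(gpa >> PAGE_SHIFT),
               (unsigned long long)hpage_base_gfn(gpa, 2),
               (unsigned long long)hpage_base_gfn(gpa, 3));
        return 0;
}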
- */ - mmu_topup_memory_caches(vcpu, true); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5819,9 +5980,25 @@ int kvm_mmu_init_vm(struct kvm *kvm) node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); + + kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; + kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; + kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; + return 0; } +static void mmu_free_vm_memory_caches(struct kvm *kvm) +{ + kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); +} + void kvm_mmu_uninit_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; @@ -5829,9 +6006,11 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); kvm_mmu_uninit_tdp_mmu(kvm); + + mmu_free_vm_memory_caches(kvm); } -static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; @@ -5853,8 +6032,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - + flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5879,7 +6057,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); + flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) @@ -5950,15 +6128,249 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } +static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) +{ + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static bool need_topup_split_caches_or_resched(struct kvm *kvm) +{ + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + /* + * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed + * to split a single huge page. Calculating how many are actually needed + * is possible but not worth the complexity. + */ + return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || + need_topup(&kvm->arch.split_page_header_cache, 1) || + need_topup(&kvm->arch.split_shadow_page_cache, 1); +} + +static int topup_split_caches(struct kvm *kvm) +{ + /* + * Allocating rmap list entries when splitting huge pages for nested + * MMUs is uncommon as KVM needs to use a list if and only if there is + * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be + * aliased by multiple L2 gfns and/or from multiple nested roots with + * different roles. Aliasing gfns when using TDP is atypical for VMMs; + * a few gfns are often aliased during boot, e.g. when remapping BIOS, + * but aliasing rarely occurs post-boot or for many gfns. 
If there is + * only one rmap entry, rmap->val points directly at that one entry and + * doesn't need to allocate a list. Buffer the cache by the default + * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM + * encounters an aliased gfn or two. + */ + const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, + SPLIT_DESC_CACHE_MIN_NR_OBJECTS); + if (r) + return r; + + r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); + if (r) + return r; + + return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); +} + +static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + struct shadow_page_caches caches = {}; + union kvm_mmu_page_role role; + unsigned int access; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); + + /* + * Note, huge page splitting always uses direct shadow pages, regardless + * of whether the huge page itself is mapped by a direct or indirect + * shadow page, since the huge page region itself is being directly + * mapped with smaller pages. + */ + role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); + + /* Direct SPs do not require a shadowed_info_cache. */ + caches.page_header_cache = &kvm->arch.split_page_header_cache; + caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; + + /* Safe to pass NULL for vCPU since requesting a direct SP. */ + return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); +} + +static void shadow_mmu_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) + +{ + struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; + u64 huge_spte = READ_ONCE(*huge_sptep); + struct kvm_mmu_page *sp; + bool flush = false; + u64 *sptep, spte; + gfn_t gfn; + int index; + + sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); + + for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { + sptep = &sp->spt[index]; + gfn = kvm_mmu_page_get_gfn(sp, index); + + /* + * The SP may already have populated SPTEs, e.g. if this huge + * page is aliased by multiple sptes with the same access + * permissions. These entries are guaranteed to map the same + * gfn-to-pfn translation since the SP is direct, so no need to + * modify them. + * + * However, if a given SPTE points to a lower level page table, + * that lower level page table may only be partially populated. + * Installing such SPTEs would effectively unmap a potion of the + * huge page. Unmapping guest memory always requires a TLB flush + * since a subsequent operation on the unmapped regions would + * fail to detect the need to flush. + */ + if (is_shadow_present_pte(*sptep)) { + flush |= !is_last_spte(*sptep, sp->role.level); + continue; + } + + spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + mmu_spte_set(sptep, spte); + __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); + } + + __link_shadow_page(kvm, cache, huge_sptep, sp, flush); +} + +static int shadow_mmu_try_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + int level, r = 0; + gfn_t gfn; + u64 spte; + + /* Grab information for the tracepoint before dropping the MMU lock. 
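/*
 * Editor's illustration (not part of this commit): conceptually, splitting one
 * huge SPTE installs 512 child SPTEs that partition the huge range, each
 * inheriting the huge page's access bits.  The child gfn arithmetic below
 * mirrors kvm_mmu_page_get_gfn() for direct shadow pages (9 index bits per
 * level); the SPTE encoding itself is left out.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t child_gfn(uint64_t huge_gfn, int child_sp_level, int index)
{
        return huge_gfn + ((uint64_t)index << ((child_sp_level - 1) * 9));
}

int main(void)
{
        /* 2MiB split: the child SP is a level-1 page table of 4KiB entries. */
        printf("2M split, child 3 -> gfn %#llx\n",
               (unsigned long long)child_gfn(0x80000, 1, 3));
        /* 1GiB split: the child SP is a level-2 directory of 2MiB entries.  */
        printf("1G split, child 3 -> gfn %#llx\n",
               (unsigned long long)child_gfn(0x80000, 2, 3));
        return 0;
}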
*/ + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + level = huge_sp->role.level; + spte = *huge_sptep; + + if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { + r = -ENOSPC; + goto out; + } + + if (need_topup_split_caches_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* + * If the topup succeeds, return -EAGAIN to indicate that the + * rmap iterator should be restarted because the MMU lock was + * dropped. + */ + r = topup_split_caches(kvm) ?: -EAGAIN; + write_lock(&kvm->mmu_lock); + goto out; + } + + shadow_mmu_split_huge_page(kvm, slot, huge_sptep); + +out: + trace_kvm_mmu_split_huge_page(gfn, spte, level, r); + return r; +} + +static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) +{ + struct rmap_iterator iter; + struct kvm_mmu_page *sp; + u64 *huge_sptep; + int r; + +restart: + for_each_rmap_spte(rmap_head, &iter, huge_sptep) { + sp = sptep_to_sp(huge_sptep); + + /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ + if (WARN_ON_ONCE(!sp->role.guest_mode)) + continue; + + /* The rmaps should never contain non-leaf SPTEs. */ + if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) + continue; + + /* SPs with level >PG_LEVEL_4K should never by unsync. */ + if (WARN_ON_ONCE(sp->unsync)) + continue; + + /* Don't bother splitting huge pages on invalid SPs. */ + if (sp->role.invalid) + continue; + + r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); + + /* + * The split succeeded or needs to be retried because the MMU + * lock was dropped. Either way, restart the iterator to get it + * back into a consistent state. + */ + if (!r || r == -EAGAIN) + goto restart; + + /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ + break; + } + + return false; +} + +static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level) +{ + int level; + + /* + * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working + * down to the target level. This ensures pages are recursively split + * all the way to the target level. There's no need to split pages + * already at the target level. + */ + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) { + slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, false); + } +} + /* Must be called with the mmu_lock held in write-mode. 
*/ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, - target_level, false); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same resons as in @@ -5973,12 +6385,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (is_tdp_mmu_enabled(kvm)) { - read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); - read_unlock(&kvm->mmu_lock); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + write_unlock(&kvm->mmu_lock); } + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to @@ -5997,13 +6416,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); - pfn = spte_to_pfn(*sptep); /* * We cannot do huge page mapping for indirect shadow pages, @@ -6012,10 +6429,10 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - pfn, PG_LEVEL_NUM)) { - pte_list_remove(kvm, rmap_head, sptep); + PG_LEVEL_NUM)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -6030,18 +6447,24 @@ restart: return need_tlb_flush; } +static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + /* + * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap + * pages that are already mapped at the maximum hugepage level. + */ + if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); +} + void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - /* - * Zap only 4k SPTEs since the legacy MMU only supports dirty - * logging at a 4k granularity and never creates collapsible - * 2m SPTEs during dirty logging. 
- */ - if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bd2a26897b97..582def531d4d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -20,6 +20,20 @@ extern bool dbg; #define MMU_WARN_ON(x) do { } while (0) #endif +/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ +#define __PT_LEVEL_SHIFT(level, bits_per_level) \ + (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) +#define __PT_INDEX(address, level, bits_per_level) \ + (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1)) + +#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level)) + /* * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT * bit, and thus are guaranteed to be non-zero when valid. And, when a guest @@ -53,8 +67,21 @@ struct kvm_mmu_page { gfn_t gfn; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; + + /* + * Stores the result of the guest translation being shadowed by each + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both + * cases the result of the translation is a GPA and a set of access + * constraints. + * + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed + * access permissions are stored in the lower bits. Note, for + * convenience and uniformity across guests, the access permissions are + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. 
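To make the packing described in the comment above concrete, a minimal editorial sketch; the helper name is invented, and the exact masks simply mirror the comment (GFN above PAGE_SHIFT, ACC_* bits below it):

        /* Illustrative only: one shadowed_translation entry per SPTE. */
        static u64 example_pack_shadowed_translation(gfn_t gfn, unsigned int access)
        {
                return ((u64)gfn << PAGE_SHIFT) | (access & ACC_ALL);
        }

        /* Unpacking, as kvm_mmu_page_get_gfn()/_get_access() consume it:
         *   gfn    = entry >> PAGE_SHIFT;
         *   access = entry & ACC_ALL;
         */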
+ */ + u64 *shadowed_translation; + /* Currently serving as active root */ union { int root_count; @@ -141,9 +168,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -242,7 +269,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, @@ -281,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level); + int max_level); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h deleted file mode 100644 index de8ab323bb70..000000000000 --- a/arch/x86/kvm/mmu/paging.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Shadow paging constants/helpers that don't need to be #undef'd. */ -#ifndef __KVM_X86_PAGING_H -#define __KVM_X86_PAGING_H - -#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_LVL_ADDR_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#endif /* __KVM_X86_PAGING_H */ - diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index db80f7ccaa4e..f5958071220c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -16,25 +16,21 @@ */ /* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. + * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, + * as well as guest EPT tables, so the code in this file is compiled thrice, + * once per guest PTE type. The per-type defines are #undef'd at the end. 
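The per-VM kill switch consulted by is_nx_huge_page_enabled() above (kvm->arch.disable_nx_huge_pages) is driven from userspace. A hypothetical VMM-side sketch follows; the capability name KVM_CAP_VM_DISABLE_NX_HUGE_PAGES and the before-vCPU-creation restriction are assumptions taken from this series' documentation, not guarantees:

        /* Hypothetical VMM snippet; vm_fd is an already-created KVM VM fd. */
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        static int example_disable_nx_huge_pages(int vm_fd)
        {
                struct kvm_enable_cap cap = {
                        .cap = KVM_CAP_VM_DISABLE_NX_HUGE_PAGES,
                };

                /* Assumed to be required before any vCPU is created. */
                return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
        }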
*/ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG "cmpxchgq" #else #define PT_MAX_FULL_LEVELS 2 #endif @@ -42,36 +38,35 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG "cmpxchgl" + + #define PT32_DIR_PSE36_SIZE 4 + #define PT32_DIR_PSE36_SHIFT 13 + #define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) - #ifdef CONFIG_X86_64 - #define CMPXCHG "cmpxchgq" - #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif +/* Common logic, but per-type values. These also need to be undefined. */ +#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) +#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) + #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) @@ -97,6 +92,15 @@ struct guest_walker { struct x86_exception fault; }; +#if PTTYPE == 32 +static inline gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} +#endif + static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; @@ -374,7 +378,7 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. 
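A few worked values for the shared __PT_* helpers from mmu_internal.h, using the PT_LEVEL_BITS values defined just above; these are editorial sanity checks rather than new code:

        /*
         * 64-bit and EPT guest PTEs (9 bits per level):
         *   __PT_LEVEL_SHIFT(1, 9)  == 12                  4KiB leaves
         *   __PT_LEVEL_SHIFT(2, 9)  == 21                  2MiB entries
         *   __PT_INDEX(addr, 2, 9)  == (addr >> 21) & 511
         *   __PT_ENT_PER_PAGE(9)    == 512
         *
         * 32-bit non-PAE guest PTEs (10 bits per level):
         *   __PT_LEVEL_SHIFT(2, 10) == 22                  4MiB entries
         *   __PT_ENT_PER_PAGE(10)   == 1024
         *
         * PSE-36: pse36_gfn_delta() lifts PDE bits [16:13] into physical
         * address bits [35:32], i.e. a left shift by 32 - 13 - 12 = 7 once
         * the value is expressed as a gfn.
         */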
*/ - if (unlikely(real_gpa == UNMAPPED_GVA)) + if (unlikely(real_gpa == INVALID_GPA)) return 0; host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), @@ -421,11 +425,13 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) +#if PTTYPE == 32 + if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); +#endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; @@ -589,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -642,14 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); - - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, - it.level-1, false, access); + + table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); + + if (sp != ERR_PTR(-EEXIST)) { /* * We must synchronize the pagetable before linking it * because the guest doesn't need to flush tlb when @@ -678,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; - if (sp) + if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); } @@ -702,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); + if (sp == ERR_PTR(-EEXIST)) + continue; - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, - it.level - 1, true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + link_shadow_page(vcpu, it.sptep, sp); + if (fault->huge_page_disallowed && + fault->req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -888,7 +892,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; + offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } @@ -929,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) break; pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + pte_gpa += spte_index(sptep) * sizeof(pt_element_t); mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) @@ -958,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct x86_exception *exception) { struct 
guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; + gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 @@ -978,7 +982,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } /* - * Using the cached information from sp->gfns is safe because: + * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is + * safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. * @@ -1023,7 +1028,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { u64 *sptep, spte; struct kvm_memory_slot *slot; unsigned pte_access; @@ -1053,12 +1058,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; - if (gfn != sp->gfns[i]) { + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); flush = true; continue; } + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); + sptep = &sp->spt[i]; spte = *sptep; host_writable = spte & shadow_host_writable_mask; @@ -1070,6 +1086,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) flush |= mmu_spte_update(sptep, spte); } + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. 
+ */ return flush; } @@ -1084,7 +1109,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl -#undef CMPXCHG #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index b5960bbde7f7..7314d27d57a4 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -129,6 +130,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; + WARN_ON_ONCE(!pte_access && !shadow_present_mask); + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_mmu_page_ad_need_write_protect(sp)) @@ -145,7 +148,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -159,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + if (shadow_memtype_mask) + spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -244,10 +247,10 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, + int index) { u64 child_spte; - int child_level; if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) return 0; @@ -256,23 +259,23 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) return 0; child_spte = huge_spte; - child_level = huge_level - 1; /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ - child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; - if (child_level == PG_LEVEL_4K) { + if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* - * When splitting to a 4K page, mark the page executable as the - * NX hugepage mitigation no longer applies. + * When splitting to a 4K page where execution is allowed, mark + * the page executable as the NX hugepage mitigation no longer + * applies. 
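To make the offset math in make_huge_page_split_spte() above concrete, an editorial worked example using the existing KVM_PAGES_PER_HPAGE() definition:

        /*
         * Splitting a 1GiB SPTE into 512 direct 2MiB children (the new SP has
         * role.level == PG_LEVEL_2M): child index 3 contributes
         *
         *   (3 * KVM_PAGES_PER_HPAGE(PG_LEVEL_2M)) << PAGE_SHIFT
         *     = (3 * 512) << 12
         *     = 3 * 2MiB
         *
         * on top of the huge page's base address, so each child maps its own
         * 2MiB slice of the original 1GiB region.
         */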
*/ - if (is_nx_huge_page_enabled()) + if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } @@ -299,7 +302,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) { u64 new_spte; - new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte = old_spte & ~SPTE_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; @@ -389,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* + * EPT overrides the host MTRRs, and so KVM must program the desired + * memtype directly into the SPTEs. Note, this mask is just the mask + * of all bits that factor into the memtype, the actual memtype must be + * dynamically calculated, e.g. to ensure host MMIO is mapped UC. + */ + shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -439,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; + + /* + * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB + * memtype in the SPTEs, i.e. relies on host MTRRs to provide the + * correct memtype (WB is the "weakest" memtype). + */ + shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0127bb6e3c7d..cabe3fbb4f39 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -36,12 +36,12 @@ extern bool __read_mostly enable_mmio_caching; static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ +#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 @@ -50,17 +50,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull +#define SPTE_EPT_READABLE_MASK 0x1ull +#define SPTE_EPT_EXECUTABLE_MASK 0x4ull -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +#define SPTE_LEVEL_BITS 9 +#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) +#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) +#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -69,8 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. 
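For readers new to access tracking, a short editorial illustration of how the SHADOW_ACC_TRACK_SAVED_* definitions above are used; the arithmetic mirrors mark_spte_for_access_track():

        /*
         * An EPT SPTE with R and X set carries 0x5 in its low bits
         * (SPTE_EPT_READABLE_MASK | SPTE_EPT_EXECUTABLE_MASK). Marking it
         * for access tracking parks those bits in the saved area:
         *
         *   spte |= (spte & 0x5) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
         *   spte &= ~shadow_acc_track_mask;
         *
         * Restoring access shifts them back down, which is why the saved
         * area starting at bit 54 must not overlap the A/D-type bits.
         */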
*/ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ + SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) @@ -151,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; @@ -194,6 +191,12 @@ static inline bool is_removed_spte(u64 spte) return spte == REMOVED_SPTE; } +/* Get an SPTE's index into its parent's page table (and the spt array). */ +static inline int spte_index(u64 *sptep) +{ + return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); +} + /* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to @@ -282,7 +285,7 @@ static inline bool is_executable_pte(u64 spte) static inline kvm_pfn_t spte_to_pfn(u64 pte) { - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) @@ -425,7 +428,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index ee4802d7b36c..39b48e7d7d1a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -11,7 +11,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } @@ -116,8 +116,8 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. */ - if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == - (PT64_ENT_PER_PAGE - 1)) + if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (SPTE_ENT_PER_PAGE - 1)) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); @@ -146,15 +146,6 @@ static bool try_step_up(struct tdp_iter *iter) } /* - * Step the iterator back up a level in the paging structure. Should only be - * used when the iterator is below the root level. - */ -void tdp_iter_step_up(struct tdp_iter *iter) -{ - WARN_ON(!try_step_up(iter)); -} - -/* * Step to the next SPTE in a pre-order traversal of the paging structure. 
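The new spte_index() helper above leans purely on pointer alignment; a brief editorial justification with concrete numbers:

        /*
         * sp->spt is a page-aligned array of 512 u64 entries, so the low 12
         * bits of any sptep encode its byte offset within that page:
         *
         *   spte_index(sptep) == ((unsigned long)sptep / 8) & 511
         *
         * e.g. an SPTE at page offset 0xff8 (the last slot) yields
         * 0xff8 / 8 = 0x1ff = 511, with no need to look up the owning
         * kvm_mmu_page.
         */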
* To get to the next SPTE, the iterator either steps down towards the goal * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index adfca0cf94d3..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -114,6 +114,5 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); -void tdp_iter_step_up(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b9265d67131..bf2ccf9debca 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -425,7 +425,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); u64 old_spte; @@ -633,7 +633,6 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); - u64 old_spte; /* * The caller is responsible for ensuring the old SPTE is not a REMOVED @@ -649,17 +648,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. */ - old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); - if (old_spte != iter->old_spte) { - /* - * The page table entry was modified by a different logical - * CPU. Refresh iter->old_spte with the current value so the - * caller operates on fresh data, e.g. if it retries - * tdp_mmu_set_spte_atomic(). - */ - iter->old_spte = old_spte; + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); @@ -934,9 +924,6 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) } /* - * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs - * have been cleared and a TLB flush is needed before releasing the MMU lock. - * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and @@ -979,10 +966,9 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns + * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or + * more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush) @@ -1487,8 +1473,8 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. 
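The tdp_mmu_set_spte_atomic() conversion above relies on try_cmpxchg64() refreshing the expected value on failure; roughly, as an editorial sketch rather than the real implementation:

        /* Sketch of the semantics try_cmpxchg64(ptr, &old, new) provides: */
        static inline bool try_cmpxchg64_sketch(u64 *ptr, u64 *old, u64 new)
        {
                u64 cur = cmpxchg64(ptr, *old, new);

                if (cur == *old)
                        return true;
                *old = cur;     /* failure hands back the current value */
                return false;
        }

        /* That refresh is why the manual "update iter->old_spte and retry"
         * bookkeeping could be deleted: on -EBUSY the iterator already sees
         * fresh data. */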
*/ - for (i = 0; i < PT64_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1507,7 +1493,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * are overwriting from the page stats. But we have to manually update * the page stats with the new present child pages. */ - kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); out: trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); @@ -1731,10 +1717,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -/* - * Clear leaf entries which could be replaced by large mappings, for - * GFNs within the slot. - */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, const struct kvm_memory_slot *slot) @@ -1743,61 +1725,52 @@ static void zap_collapsible_spte_range(struct kvm *kvm, gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; - kvm_pfn_t pfn; rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { +retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || + !is_shadow_present_pte(iter.old_spte)) continue; /* - * This is a leaf SPTE. Check if the PFN it maps can - * be mapped at a higher level. + * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with + * a large page size, then its parent would have been zapped + * instead of stepping down. */ - pfn = spte_to_pfn(iter.old_spte); - - if (kvm_is_reserved_pfn(pfn)) + if (is_last_spte(iter.old_spte, iter.level)) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, pfn, PG_LEVEL_NUM); - - WARN_ON(max_mapping_level < iter.level); - /* - * If this page is already mapped at the highest - * viable level, there's nothing more to do. + * If iter.gfn resides outside of the slot, i.e. the page for + * the current level overlaps but is not contained by the slot, + * then the SPTE can't be made huge. More importantly, trying + * to query that info from slot->arch.lpage_info will cause an + * out-of-bounds access. */ - if (max_mapping_level == iter.level) + if (iter.gfn < start || iter.gfn >= end) continue; - /* - * The page can be remapped at a higher level, so step - * up to zap the parent SPTE. - */ - while (max_mapping_level > iter.level) - tdp_iter_step_up(&iter); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, + iter.gfn, PG_LEVEL_NUM); + if (max_mapping_level < iter.level) + continue; /* Note, a successful atomic zap also does a remote TLB flush. */ - tdp_mmu_zap_spte_atomic(kvm, &iter); - - /* - * If the atomic zap fails, the iter will recurse back into - * the same subtree to retry. - */ + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + goto retry; } rcu_read_unlock(); } /* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Zap non-leaf SPTEs (and free their associated page tables) which could + * be replaced by huge pages, for GFNs within the slot. 
*/ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3f868fed9114..02f9e4f245bd 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -16,6 +16,7 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <asm/perf_event.h> +#include <asm/cpu_device_id.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -24,6 +25,15 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +struct x86_pmu_capability __read_mostly kvm_pmu_cap; +EXPORT_SYMBOL_GPL(kvm_pmu_cap); + +static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + {} +}; + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -34,7 +44,9 @@ * However AMD doesn't support fixed-counters; * - There are three types of index to access perf counters (PMC): * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD - * has MSR_K7_PERFCTRn. + * has MSR_K7_PERFCTRn and, for families 15H and later, + * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are + * aliased to MSR_K7_PERFCTRn. * 2. MSR Index (named idx): This normally is used by RDPMC instruction. * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except @@ -46,7 +58,8 @@ * between pmc and perf counters is as the following: * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed - * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H + * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; @@ -86,15 +99,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); + bool skip_pmi = false; /* Ignore counters that have been reprogrammed already. */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { + /* Indicate PEBS overflow PMI to guest. 
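Background for the PMU rework further down (the unified reprogram_counter() and check_pmu_event_filter()): an editorial note on the counter-index convention, assuming the usual perf definition INTEL_PMC_IDX_FIXED == 32:

        /*
         * GP counters occupy pmc->idx 0..N-1, Intel fixed counters start at
         * INTEL_PMC_IDX_FIXED, so (pmc->idx - INTEL_PMC_IDX_FIXED) is the
         * fixed-counter number used to index fixed_ctr_ctrl. For example,
         * with fixed_ctr_ctrl == 0x0b0:
         *
         *   fixed_ctrl_field(0x0b0, 1) == 0xb     (PMI | USR | OS enabled)
         *
         * which the reworked reprogram_counter() turns into
         * ARCH_PERFMON_EVENTSEL_INT | _USR | _OS on the synthesized eventsel.
         */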
*/ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } else { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + } kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - if (!pmc->intr) + if (!pmc->intr || skip_pmi) return; /* @@ -124,6 +144,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; struct perf_event_attr attr = { .type = type, @@ -135,9 +156,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; - - if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) - return; + bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); attr.sample_period = get_sample_period(pmc, pmc->counter); @@ -150,6 +169,25 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, */ attr.sample_period = 0; } + if (pebs) { + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + * + * For most PEBS hardware events, the difference in the software + * precision levels of guest and host PEBS events will not affect + * the accuracy of the PEBS profiling result, because the "event IP" + * in the PEBS record is calibrated on the guest side. + * + * On Icelake everything is fine. Other hardware (GLC+, TNT+) that + * could possibly care here is unsupported and needs changes. + */ + attr.precise_ip = 1; + if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) + attr.precise_ip = 3; + } event = perf_event_create_kernel_counter(&attr, -1, current, kvm_perf_overflow, pmc); @@ -163,7 +201,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; - pmc->intr = intr; + pmc->intr = intr || pebs; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -189,6 +227,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; + if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && + pmc->perf_event->attr.precise_ip) + return false; + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); pmc->is_paused = false; @@ -205,115 +247,83 @@ static int cmp_u64(const void *pa, const void *pb) return (a > b) - (a < b); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - u64 config; - u32 type = PERF_TYPE_RAW; - struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); + struct kvm *kvm = pmc->vcpu->kvm; bool allow_event = true; + __u64 key; + int idx; - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); - - pmc->eventsel = eventsel; - - pmc_pause_counter(pmc); - - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) - return; + if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) + return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + if (!filter) + goto out; + if (pmc_is_gp(pmc)) { + key = pmc->eventsel & 
AMD64_RAW_EVENT_MASK_NB; if (bsearch(&key, filter->events, filter->nevents, sizeof(__u64), cmp_u64)) allow_event = filter->action == KVM_PMU_EVENT_ALLOW; else allow_event = filter->action == KVM_PMU_EVENT_DENY; + } else { + idx = pmc->idx - INTEL_PMC_IDX_FIXED; + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; } - if (!allow_event) - return; - - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } - - if (type == PERF_TYPE_RAW) - config = eventsel & pmu->raw_event_mask; - - if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) - return; - - pmc_release_perf_event(pmc); - pmc->current_config = eventsel; - pmc_reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); +out: + return allow_event; } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_counter(struct kvm_pmc *pmc) { - unsigned en_field = ctrl & 0x3; - bool pmi = ctrl & 0x8; - struct kvm_pmu_event_filter *filter; - struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 eventsel = pmc->eventsel; + u64 new_config = eventsel; + u8 fixed_ctr_ctrl; pmc_pause_counter(pmc); - if (!en_field || !pmc_is_enabled(pmc)) + if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - } - - if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) + if (!check_pmu_event_filter(pmc)) return; - pmc_release_perf_event(pmc); - - pmc->current_config = (u64)ctrl; - pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), - !(en_field & 0x2), /* exclude user */ - !(en_field & 0x1), /* exclude kernel */ - pmi); -} -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) -{ - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); + if (pmc_is_fixed(pmc)) { + fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + if (fixed_ctr_ctrl & 0x1) + eventsel |= ARCH_PERFMON_EVENTSEL_OS; + if (fixed_ctr_ctrl & 0x2) + eventsel |= ARCH_PERFMON_EVENTSEL_USR; + if (fixed_ctr_ctrl & 0x8) + eventsel |= ARCH_PERFMON_EVENTSEL_INT; + new_config = (u64)fixed_ctr_ctrl; + } - if (!pmc) + if (pmc->current_config == new_config && pmc_resume_counter(pmc)) return; - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); - else { - int idx = pmc_idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + pmc_release_perf_event(pmc); - reprogram_fixed_counter(pmc, ctrl, idx); - } + pmc->current_config = new_config; + pmc_reprogram_counter(pmc, 
PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); } EXPORT_SYMBOL_GPL(reprogram_counter); @@ -329,8 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) clear_bit(bit, pmu->reprogram_pmi); continue; } - - reprogram_counter(pmu, bit); + reprogram_counter(pmc); } /* @@ -471,17 +480,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_fixed(pmc)) - return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; - - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; -} - /* Release perf_events for vPMCs that have been unused for a full time slice. */ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) { @@ -514,13 +512,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 prev_count; prev_count = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - reprogram_counter(pmu, pmc->idx); + reprogram_counter(pmc); if (pmc->counter < prev_count) __kvm_perf_overflow(pmc, false); } @@ -528,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int perf_hw_id) { - u64 old_eventsel = pmc->eventsel; - unsigned int config; - - pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - pmc->eventsel = old_eventsel; - return config == perf_hw_id; + return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) & + AMD64_RAW_EVENT_MASK_NB); } static inline bool cpl_is_matched(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index e745f443b6a8..5cc5721f260b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -8,6 +8,9 @@ #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) +#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) @@ -22,7 +25,7 @@ struct kvm_event_hw_type_mapping { }; struct kvm_pmu_ops { - unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); + bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, @@ -144,9 +147,43 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter)); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +extern struct x86_pmu_capability kvm_pmu_cap; + +static inline void kvm_init_pmu_capability(void) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + + 
perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. + */ + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) + enable_pmu = false; + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} + +void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d1bc5820ea46..6919dee69f18 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -40,6 +40,9 @@ #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + /* Note: * This hash table is used to map VM_ID to a struct kvm_svm, * when handling AMD IOMMU GALOG notification to schedule in @@ -50,6 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); +enum avic_modes avic_mode; /* * This is a wrapper of struct amd_iommu_ir_data. @@ -59,6 +63,54 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; +static void avic_activate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + + /* Note: + * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC + * MSR accesses, while interrupt injection to a running vCPU + * can be achieved using AVIC doorbell. The AVIC hardware still + * accelerate MMIO accesses, but this does not cause any harm + * as the guest is not supposed to access xAPIC mmio when uses x2APIC. 
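For orientation (standard x2APIC architecture, not something introduced by this patch): the x2APIC register block is exposed as MSRs, which is what the MSR-intercept toggling below operates on.

        /*
         * x2APIC MSR == 0x800 + (xAPIC MMIO offset >> 4), e.g.:
         *   APIC_ID  (offset 0x020) -> MSR 0x802
         *   APIC_ICR (offset 0x300) -> MSR 0x830
         * With x2AVIC active those MSR accesses are intended to be handled by
         * hardware against the AVIC backing page instead of being intercepted.
         */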
+ */ + if (apic_x2apic_mode(svm->vcpu.arch.apic) && + avic_mode == AVIC_MODE_X2) { + vmcb->control.int_ctl |= X2APIC_MODE_MASK; + vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, false); + } else { + /* For xAVIC and hybrid-xAVIC modes */ + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); + } +} + +static void avic_deactivate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + /* + * If running nested and the guest uses its own MSR bitmap, there + * is no need to update L0's msr bitmap + */ + if (is_guest_mode(&svm->vcpu) && + vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) + return; + + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); +} /* Note: * This function is called from IOMMU driver to notify @@ -175,13 +227,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; if (kvm_apicv_activated(svm->vcpu.kvm)) - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); else - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, @@ -190,7 +241,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -237,7 +289,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) @@ -279,8 +332,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) */ int cpu = READ_ONCE(vcpu->cpu); - if (cpu != get_cpu()) + if (cpu != get_cpu()) { wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); + } put_cpu(); } @@ -303,7 +358,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) dest = icrh; else - dest = GET_APIC_DEST_FIELD(icrh); + dest = GET_XAPIC_DEST_FIELD(icrh); if (dest_mode == APIC_DEST_PHYSICAL) { /* broadcast destination, use slow path */ @@ -345,9 +400,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source logid_index = cluster + __ffs(bitmap); - if (apic_x2apic_mode(source)) { - l1_physical_id = logid_index; - } else { + if (!apic_x2apic_mode(source)) { u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); @@ -362,6 +415,23 @@ static int 
avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source l1_physical_id = logid_entry & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC logical mode, cannot leverage the index. + * Instead, calculate physical ID from logical ID in ICRH. + */ + int cluster = (icrh & 0xffff0000) >> 16; + int apic = ffs(icrh & 0xffff) - 1; + + /* + * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) + * contains anything but a single bit, we cannot use the + * fast path, because it is limited to a single vCPU. + */ + if (apic < 0 || icrh != (1 << apic)) + return -EINVAL; + + l1_physical_id = (cluster << 4) + apic; } } @@ -396,9 +466,15 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. */ kvm_for_each_vcpu(i, vcpu, kvm) { + u32 dest; + + if (apic_x2apic_mode(vcpu->arch.apic)) + dest = icrh; + else + dest = GET_XAPIC_DEST_FIELD(icrh); + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) { + dest, icrl & APIC_DEST_MASK)) { vcpu->arch.apic->irr_pending = true; svm_complete_interrupt_delivery(vcpu, icrl & APIC_MODE_MASK, @@ -514,8 +590,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool flat = svm->dfr_reg == APIC_DFR_FLAT; - u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + u32 *entry; + + /* Note: x2AVIC does not use logical APIC ID table */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return; + entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } @@ -527,6 +608,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); + /* AVIC does not support LDR update for x2APIC */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return 0; + if (ldr == svm->ldr_reg) return 0; @@ -654,6 +739,18 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) + return; + + if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { + WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); + return; + } + avic_refresh_apicv_exec_ctrl(vcpu); +} + static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; @@ -906,7 +1003,6 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | BIT(APICV_INHIBIT_REASON_SEV) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | @@ -968,7 +1064,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); @@ -1016,9 +1111,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) * accordingly before re-activating. 
*/ avic_apicv_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); @@ -1058,3 +1153,44 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } + +/* + * Note: + * - The module param avic enable both xAPIC and x2APIC mode. + * - Hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +{ + if (!npt_enabled) + return false; + + if (boot_cpu_has(X86_FEATURE_AVIC)) { + avic_mode = AVIC_MODE_X1; + pr_info("AVIC enabled\n"); + } else if (force_avic) { + /* + * Some older systems does not advertise AVIC support. + * See Revision Guide for specific AMD processor for more detail. + */ + avic_mode = AVIC_MODE_X1; + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } + + /* AVIC is a prerequisite for x2AVIC. */ + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + if (avic_mode == AVIC_MODE_X1) { + avic_mode = AVIC_MODE_X2; + pr_info("x2AVIC enabled\n"); + } else { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + } + + if (avic_mode != AVIC_MODE_NONE) + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + + return !!avic_mode; +} diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ba7cd26f438f..76dcc8a3e849 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -230,6 +230,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; + + /* x2apic msrs are intercepted always for the nested guest */ + if (is_x2apic_msrpm_offset(p)) + continue; + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) @@ -320,7 +325,8 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + /* Note, SVM doesn't have any additional restrictions on CR4. 
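The mode selection in avic_hardware_setup() above reduces to a small decision table: xAVIC requires NPT plus either the CPUID bit or force_avic, and x2AVIC additionally requires xAVIC to have been enabled. The sketch below is not part of the patch; the enum values are copied from the hunk, while the function name and parameters are invented purely to restate that precedence:

#include <stdio.h>

enum avic_modes { AVIC_MODE_NONE = 0, AVIC_MODE_X1, AVIC_MODE_X2 };

/* Illustrative only: same precedence as avic_hardware_setup() above. */
static enum avic_modes pick_avic_mode(int npt_enabled, int cpuid_avic,
				      int cpuid_x2avic, int force_avic)
{
	enum avic_modes mode = AVIC_MODE_NONE;

	if (!npt_enabled)
		return AVIC_MODE_NONE;

	if (cpuid_avic || force_avic)
		mode = AVIC_MODE_X1;		/* xAVIC */

	/* x2AVIC is only usable on top of a working xAVIC setup. */
	if (cpuid_x2avic && mode == AVIC_MODE_X1)
		mode = AVIC_MODE_X2;

	return mode;
}

int main(void)
{
	/* CPUID lacks AVIC, force_avic set, X2AVIC bit present -> AVIC_MODE_X2. */
	printf("%d\n", pick_avic_mode(1, 0, 1, 1));
	return 0;
}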
*/ + if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4))) return false; if (CC(!kvm_valid_efer(vcpu, save->efer))) @@ -371,6 +377,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->nested_ctl = from->nested_ctl; to->event_inj = from->event_inj; to->event_inj_err = from->event_inj_err; + to->next_rip = from->next_rip; to->nested_cr3 = from->nested_cr3; to->virt_ext = from->virt_ext; to->pause_filter_count = from->pause_filter_count; @@ -608,7 +615,33 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) +static inline bool is_evtinj_soft(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + if (type == SVM_EVTINJ_TYPE_SOFT) + return true; + + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); +} + +static bool is_evtinj_nmi(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + return type == SVM_EVTINJ_TYPE_NMI; +} + +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, + unsigned long vmcb12_rip, + unsigned long vmcb12_csbase) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; @@ -650,7 +683,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } @@ -664,6 +697,30 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.event_inj = svm->nested.ctl.event_inj; vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + /* + * next_rip is consumed on VMRUN as the return address pushed on the + * stack for injected soft exceptions/interrupts. If nrips is exposed + * to L1, take it verbatim from vmcb12. If nrips is supported in + * hardware but not exposed to L1, stuff the actual L2 RIP to emulate + * what a nrips=0 CPU would do (L1 is responsible for advancing RIP + * prior to injecting the event). 
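is_evtinj_soft() and is_evtinj_nmi() above only consult the valid bit, the type field and the vector of an EVENTINJ value. A standalone decode of that layout is sketched below, assuming the field positions described in the APM (vector in bits 7:0, type in bits 10:8, valid in bit 31); the constant names are local to this sketch, not the kernel's SVM_EVTINJ_* definitions:

#include <stdio.h>

#define EVTINJ_VEC_MASK		0x000000ffu
#define EVTINJ_TYPE_SHIFT	8
#define EVTINJ_TYPE_MASK	(0x7u << EVTINJ_TYPE_SHIFT)
#define EVTINJ_TYPE_NMI		(0x2u << EVTINJ_TYPE_SHIFT)
#define EVTINJ_TYPE_EXEPT	(0x3u << EVTINJ_TYPE_SHIFT)
#define EVTINJ_TYPE_SOFT	(0x4u << EVTINJ_TYPE_SHIFT)
#define EVTINJ_VALID		(1u << 31)

int main(void)
{
	/* A software interrupt (INT 0x80) as it would be encoded for injection. */
	unsigned int evtinj = EVTINJ_VALID | EVTINJ_TYPE_SOFT | 0x80;

	printf("valid=%d type=%#x vector=%#x\n",
	       !!(evtinj & EVTINJ_VALID),
	       (evtinj & EVTINJ_TYPE_MASK) >> EVTINJ_TYPE_SHIFT,
	       evtinj & EVTINJ_VEC_MASK);
	return 0;
}

A software-interrupt type, or an exception type carrying a soft vector, is what makes the helpers above latch the soft-injection bookkeeping.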
+ */ + if (svm->nrips_enabled) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else if (boot_cpu_has(X86_FEATURE_NRIPS)) + vmcb02->control.next_rip = vmcb12_rip; + + svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + if (is_evtinj_soft(vmcb02->control.event_inj)) { + svm->soft_int_injected = true; + svm->soft_int_csbase = vmcb12_csbase; + svm->soft_int_old_rip = vmcb12_rip; + if (svm->nrips_enabled) + svm->soft_int_next_rip = svm->nested.ctl.next_rip; + else + svm->soft_int_next_rip = vmcb12_rip; + } + vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; if (svm->lbrv_enabled) @@ -745,7 +802,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, @@ -834,6 +891,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; @@ -982,7 +1041,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -1421,6 +1480,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->nested_ctl = from->nested_ctl; dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; + dst->next_rip = from->next_rip; dst->nested_cr3 = from->nested_cr3; dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; @@ -1605,7 +1665,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); /* * While the nested guest CR3 is already checked and set by diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 136039fc6d01..f24613a108c5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -33,34 +33,6 @@ enum index { INDEX_ERROR, }; -/* duplicated from amd_perfmon_event_map, K7 and above should work. */ -static struct kvm_event_hw_type_mapping amd_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* duplicated from amd_f17h_perfmon_event_map. 
*/ -static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* amd_pmc_perf_hw_id depends on these being the same size */ -static_assert(ARRAY_SIZE(amd_event_mapping) == - ARRAY_SIZE(amd_f17h_event_mapping)); - static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -154,31 +126,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool amd_hw_event_available(struct kvm_pmc *pmc) { - struct kvm_event_hw_type_mapping *event_mapping; - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - int i; - - /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ - if (WARN_ON(pmc_is_fixed(pmc))) - return PERF_COUNT_HW_MAX; - - if (guest_cpuid_family(pmc->vcpu) >= 0x17) - event_mapping = amd_f17h_event_mapping; - else - event_mapping = amd_event_mapping; - - for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (event_mapping[i].eventsel == event_select - && event_mapping[i].unit_mask == unit_mask) - break; - - if (i == ARRAY_SIZE(amd_event_mapping)) - return PERF_COUNT_HW_MAX; - - return event_mapping[i].event_type; + return true; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because @@ -286,8 +236,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { data &= ~pmu->reserved_bits; - if (data != pmc->eventsel) - reprogram_gp_counter(pmc, data); + if (data != pmc->eventsel) { + pmc->eventsel = data; + reprogram_counter(pmc); + } return 0; } @@ -343,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops __initdata = { - .pmc_perf_hw_id = amd_pmc_perf_hw_id, + .hw_event_available = amd_hw_event_available, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0c240ed04f96..b0e793e7d85c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -603,6 +603,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + pr_debug("Virtual Machine Save Area (VMSA):\n"); + print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); + return 0; } @@ -1606,38 +1609,35 @@ static int sev_lock_vcpus_for_migration(struct kvm *kvm, { struct kvm_vcpu *vcpu; unsigned long i, j; - bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; - if (first) { +#ifdef CONFIG_PROVE_LOCKING + if (!i) /* * Reset the role to one that avoids colliding with * the role used for the first vcpu mutex. 
*/ role = SEV_NR_MIGRATION_ROLES; - first = false; - } else { + else mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); - } +#endif } return 0; out_unlock: - first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; - if (first) - first = false; - else +#ifdef CONFIG_PROVE_LOCKING + if (j) mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); - +#endif mutex_unlock(&vcpu->mutex); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 44bbf25dfeb9..38f873cb6f2c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -74,6 +74,8 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) + static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is initially cleared */ @@ -100,6 +102,38 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_TSC_AUX, .always = false }, + { .index = X2APIC_MSR(APIC_ID), .always = false }, + { .index = X2APIC_MSR(APIC_LVR), .always = false }, + { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, + { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, + { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, + { .index = X2APIC_MSR(APIC_EOI), .always = false }, + { .index = X2APIC_MSR(APIC_RRR), .always = false }, + { .index = X2APIC_MSR(APIC_LDR), .always = false }, + { .index = X2APIC_MSR(APIC_DFR), .always = false }, + { .index = X2APIC_MSR(APIC_SPIV), .always = false }, + { .index = X2APIC_MSR(APIC_ISR), .always = false }, + { .index = X2APIC_MSR(APIC_TMR), .always = false }, + { .index = X2APIC_MSR(APIC_IRR), .always = false }, + { .index = X2APIC_MSR(APIC_ESR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR2), .always = false }, + + /* + * Note: + * AMD does not virtualize APIC TSC-deadline timer mode, but it is + * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, + * the AVIC hardware would generate GP fault. Therefore, always + * intercept the MSR 0x832, and do not setup direct_access_msr. + */ + { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, + { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, + { .index = X2APIC_MSR(APIC_LVT0), .always = false }, + { .index = X2APIC_MSR(APIC_LVT1), .always = false }, + { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, + { .index = X2APIC_MSR(APIC_TMICT), .always = false }, + { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, + { .index = X2APIC_MSR(APIC_TDCR), .always = false }, { .index = MSR_INVALID, .always = false }, }; @@ -188,9 +222,6 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); -static bool force_avic; -module_param_unsafe(force_avic, bool, 0444); - bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -342,9 +373,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned long old_rflags; /* * SEV-ES does not expose the next RIP. 
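The X2APIC_MSR() macro introduced in the svm.c hunk above maps an xAPIC MMIO register offset to its x2APIC MSR index: divide the offset by 16 and add it to the 0x800 base. is_x2apic_msrpm_offset(), added later in this diff, runs the same arithmetic in reverse for MSR-permission-map offsets (one u32 of the bitmap covers 16 MSRs at 2 bits each). A standalone check, not part of the patch, with the base value written out for illustration:

#include <stdio.h>

#define APIC_BASE_MSR	0x800u
#define X2APIC_MSR(x)	(APIC_BASE_MSR + ((x) >> 4))

/* One u32 of the SVM MSR permission map covers 16 MSRs (2 bits per MSR). */
static int offset_is_x2apic(unsigned int msrpm_offset)
{
	unsigned int msr = msrpm_offset * 16;

	return msr >= APIC_BASE_MSR && msr < APIC_BASE_MSR + 0x100;
}

int main(void)
{
	printf("ICR  (offset 0x300) -> MSR %#x\n", X2APIC_MSR(0x300u)); /* 0x830 */
	printf("LVTT (offset 0x320) -> MSR %#x\n", X2APIC_MSR(0x320u)); /* 0x832 */
	printf("msrpm offset 0x83 covers x2APIC MSRs: %d\n", offset_is_x2apic(0x83));
	return 0;
}

The 0x832 result is the LVTT register called out in the comment above as the one x2APIC MSR that stays intercepted for the TSC-deadline timer quirk.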
The RIP update is controlled by @@ -359,18 +392,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { + if (unlikely(!commit_side_effects)) + old_rflags = svm->vmcb->save.rflags; + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; + + if (unlikely(!commit_side_effects)) + svm->vmcb->save.rflags = old_rflags; } else { kvm_rip_write(vcpu, svm->next_rip); } done: - svm_set_interrupt_shadow(vcpu, 0); + if (likely(commit_side_effects)) + svm_set_interrupt_shadow(vcpu, 0); return 1; } +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + return __svm_skip_emulated_instruction(vcpu, true); +} + +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +{ + unsigned long rip, old_rip = kvm_rip_read(vcpu); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * Due to architectural shortcomings, the CPU doesn't always provide + * NextRIP, e.g. if KVM intercepted an exception that occurred while + * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip + * the instruction even if NextRIP is supported to acquire the next + * RIP so that it can be shoved into the NextRIP field, otherwise + * hardware will fail to advance guest RIP during event injection. + * Drop the exception/interrupt if emulation fails and effectively + * retry the instruction, it's the least awful option. If NRIPS is + * in use, the skip must not commit any side effects such as clearing + * the interrupt shadow or RFLAGS.RF. + */ + if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + return -EIO; + + rip = kvm_rip_read(vcpu); + + /* + * Save the injection information, even when using next_rip, as the + * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection + * doesn't complete due to a VM-Exit occurring while the CPU is + * vectoring the event. Decoding the instruction isn't guaranteed to + * work as there may be no backing instruction, e.g. if the event is + * being injected by L1 for L2, or if the guest is patching INT3 into + * a different instruction. + */ + svm->soft_int_injected = true; + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = old_rip; + svm->soft_int_next_rip = rip; + + if (nrips) + kvm_rip_write(vcpu, old_rip); + + if (static_cpu_has(X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = rip; + + return 0; +} + static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -380,21 +470,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu); - if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(vcpu); - - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. 
- */ - (void)svm_skip_emulated_instruction(vcpu); - rip = kvm_rip_read(vcpu); - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } + if (kvm_exception_is_soft(nr) && + svm_update_soft_interrupt_rip(vcpu)) + return; svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID @@ -736,6 +814,29 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) } } +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) +{ + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (avic_mode != AVIC_MODE_X2 || + !apic_x2apic_mode(svm->vcpu.arch.apic)) + return; + + for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { + int index = direct_access_msrs[i].index; + + if ((index < APIC_BASE_MSR) || + (index > APIC_BASE_MSR + 0xff)) + continue; + set_msr_interception(&svm->vcpu, svm->msrpm, index, + !intercept, !intercept); + } + + svm->x2avic_msrs_intercepted = intercept; +} void svm_vcpu_free_msrpm(u32 *msrpm) { @@ -1231,7 +1332,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); @@ -1299,6 +1400,8 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } + svm->x2avic_msrs_intercepted = true; + svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); @@ -2345,6 +2448,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: + case SVM_EXITINTINFO_TYPE_SOFT: kvm_clear_interrupt_queue(vcpu); break; default: @@ -3375,35 +3479,49 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + + if (svm->nmi_l1_to_l2) + return; + vcpu->arch.hflags |= HF_NMI_MASK; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } -static void svm_inject_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_svm *svm = to_svm(vcpu); + u32 type; + + if (vcpu->arch.interrupt.soft) { + if (svm_update_soft_interrupt_rip(vcpu)) + return; - BUG_ON(!(gif_set(svm))); + type = SVM_EVTINJ_TYPE_SOFT; + } else { + type = SVM_EVTINJ_TYPE_INTR; + } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; + SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vector) { /* - * vcpu->arch.apicv_active must be read after vcpu->mode. + * apic->apicv_active must be read after vcpu->mode. * Pairs with smp_store_release in vcpu_enter_guest. */ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); - if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Note, this is called iff the local APIC is in-kernel. 
*/ + if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { /* Process the interrupt via inject_pending_event */ kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -3668,15 +3786,49 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, + int type) +{ + bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); + bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's + * associated with the original soft exception/interrupt. next_rip is + * cleared on all exits that can occur while vectoring an event, so KVM + * needs to manually set next_rip for re-injection. Unlike the !nrips + * case below, this needs to be done if and only if KVM is re-injecting + * the same event, i.e. if the event is a soft exception/interrupt, + * otherwise next_rip is unused on VMRUN. + */ + if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && + kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) + svm->vmcb->control.next_rip = svm->soft_int_next_rip; + /* + * If NRIPS isn't enabled, KVM must manually advance RIP prior to + * injecting the soft exception/interrupt. That advancement needs to + * be unwound if vectoring didn't complete. Note, the new event may + * not be the injected event, e.g. if KVM injected an INTn, the INTn + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will + * be the reported vectored event, but RIP still needs to be unwound. + */ + else if (!nrips && (is_soft || is_exception) && + kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) + kvm_rip_write(vcpu, svm->soft_int_old_rip); +} + static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; + bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; + bool soft_int_injected = svm->soft_int_injected; - svm->int3_injected = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; /* * If we've made progress since setting HF_IRET_MASK, we've @@ -3701,9 +3853,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + if (soft_int_injected) + svm_complete_soft_interrupt(vcpu, vector, type); + switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; + svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* @@ -3712,18 +3868,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) if (vector == X86_TRAP_VC) break; - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. 
- */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(vcpu, svm->int3_rip)) - kvm_rip_write(vcpu, - kvm_rip_read(vcpu) - int3_injected); - break; - } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_requeue_exception_e(vcpu, vector, err); @@ -3734,9 +3878,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; + case SVM_EXITINTINFO_TYPE_SOFT: + kvm_queue_interrupt(vcpu, vector, true); + break; default: break; } + } static void svm_cancel_injection(struct kvm_vcpu *vcpu) @@ -3952,7 +4100,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); cr3 = vcpu->arch.cr3; - } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) { + } else if (root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); } else { /* PCID in the guest should be impossible with a 32-bit MMU. */ @@ -4013,16 +4161,10 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; - struct kvm *kvm = vcpu->kvm; vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4049,19 +4191,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } - if (kvm_vcpu_apicv_active(vcpu)) { - /* - * AVIC does not work with an x2APIC mode guest. If the X2APIC feature - * is exposed to the guest, disable AVIC. 
- */ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); - } init_vmcb_after_set_cpuid(vcpu); } @@ -4673,11 +4807,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + .set_virtual_apic_mode = avic_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, - .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4773,7 +4907,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); - supported_xss = 0; + kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { @@ -4849,7 +4983,8 @@ static __init int svm_hardware_setup(void) init_msrpm_offsets(); - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); @@ -4859,11 +4994,11 @@ static __init int svm_hardware_setup(void) tsc_scaling = false; } else { pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; } } - kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 32; tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -4917,17 +5052,9 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); + enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); - if (enable_apicv) { - if (!boot_cpu_has(X86_FEATURE_AVIC)) { - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } else - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } else { + if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9223ac100ef5..6a7686bf6900 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -29,13 +29,21 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 21 -#define MSRPM_OFFSETS 16 +#define MAX_DIRECT_ACCESS_MSRS 46 +#define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int vgif; extern bool intercept_smi; +enum avic_modes { + AVIC_MODE_NONE = 0, + AVIC_MODE_X1, + AVIC_MODE_X2, +}; + +extern enum avic_modes avic_mode; + /* * Clean bits in VMCB. 
* VMCB_ALL_CLEAN_MASK might also need to @@ -139,6 +147,7 @@ struct vmcb_ctrl_area_cached { u64 nested_ctl; u32 event_inj; u32 event_inj_err; + u64 next_rip; u64 nested_cr3; u64 virt_ext; u32 clean; @@ -228,9 +237,12 @@ struct vcpu_svm { bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; - unsigned int3_injected; - unsigned long int3_rip; + unsigned long soft_int_csbase; + unsigned long soft_int_old_rip; + unsigned long soft_int_next_rip; + bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; @@ -264,6 +276,8 @@ struct vcpu_svm { struct vcpu_sev_es_state sev_es; bool guest_state_loaded; + + bool x2avic_msrs_intercepted; }; struct svm_cpu_data { @@ -509,6 +523,15 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static inline bool is_x2apic_msrpm_offset(u32 offset) +{ + /* 4 msrs per u8, and 4 u8 in u32 */ + u32 msr = offset * 16; + + return (msr >= APIC_BASE_MSR) && + (msr < (APIC_BASE_MSR + 0x100)); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU @@ -534,6 +557,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); @@ -603,6 +627,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ +bool avic_hardware_setup(struct kvm_x86_ops *ops); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -613,18 +638,16 @@ int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); -void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); + /* sev.c */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index de4762517569..2120d7c060a9 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count, void *data), + unsigned int count, const void *data), TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( @@ -333,18 +333,24 @@ TRACE_EVENT_KVM_EXIT(kvm_exit); * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), + TP_PROTO(unsigned int vector, bool soft, bool reinjected), + TP_ARGS(vector, soft, reinjected), TP_STRUCT__entry( - __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( 
bool, soft ) + __field( bool, reinjected ) ), TP_fast_assign( - __entry->irq = irq; + __entry->vector = vector; + __entry->soft = soft; + __entry->reinjected = reinjected; ), - TP_printk("irq %u", __entry->irq) + TP_printk("%s 0x%x%s", + __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector, + __entry->reinjected ? " [reinjected]" : "") ); #define EXS(x) { x##_VECTOR, "#" #x } @@ -358,25 +364,30 @@ TRACE_EVENT(kvm_inj_virq, * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), + TP_PROTO(unsigned exception, bool has_error, unsigned error_code, + bool reinjected), + TP_ARGS(exception, has_error, error_code, reinjected), TP_STRUCT__entry( __field( u8, exception ) __field( u8, has_error ) __field( u32, error_code ) + __field( bool, reinjected ) ), TP_fast_assign( __entry->exception = exception; __entry->has_error = has_error; __entry->error_code = error_code; + __entry->reinjected = reinjected; ), - TP_printk("%s (0x%x)", + TP_printk("%s%s%s%s%s", __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) + !__entry->has_error ? "" : " (", + !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }), + !__entry->has_error ? "" : ")", + __entry->reinjected ? " [reinjected]" : "") ); /* @@ -1479,6 +1490,24 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath, __entry->icrh, __entry->icrl, __entry->index) ); +TRACE_EVENT(kvm_avic_doorbell, + TP_PROTO(u32 vcpuid, u32 apicid), + TP_ARGS(vcpuid, apicid), + + TP_STRUCT__entry( + __field(u32, vcpuid) + __field(u32, apicid) + ), + + TP_fast_assign( + __entry->vcpuid = vcpuid; + __entry->apicid = apicid; + ), + + TP_printk("vcpuid=%u, apicid=%u", + __entry->vcpuid, __entry->apicid) +); + TRACE_EVENT(kvm_hv_timer_state, TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), TP_ARGS(vcpu_id, hv_timer_in_use), diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c0e24826a86f..c5e5dfef69c7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -6,6 +6,8 @@ #include "../lapic.h" #include "../x86.h" +#include "../pmu.h" +#include "../cpuid.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -13,6 +15,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 @@ -59,6 +62,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -94,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void) static inline bool cpu_has_load_ia32_efer(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER; } static inline bool cpu_has_load_perf_global_ctrl(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } static inline bool cpu_has_vmx_mpx(void) { - return 
(vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && - (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; } static inline bool cpu_has_vmx_tpr_shadow(void) @@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -276,6 +283,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -363,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void) rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && - (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } @@ -385,23 +396,31 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } +static inline bool vmx_pebs_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} + static inline u64 vmx_get_perf_capabilities(void) { - u64 perf_cap = 0; + u64 perf_cap = PMU_CAP_FW_WRITES; + u64 host_perf_cap = 0; if (!enable_pmu) - return perf_cap; + return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - perf_cap &= PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - /* - * Since counters are virtualized, KVM would support full - * width counting unconditionally, even if the host lacks it. 
- */ - return PMU_CAP_FW_WRITES | perf_cap; + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; } static inline u64 vmx_supported_debugctl(void) @@ -417,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void) return debugctl; } +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 87e3dc10edf4..6a61b1ae7942 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); #if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 8d70f9aea94b..f886a8ff0342 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ab135f9ef52f..ddd4367d4826 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -311,11 +311,12 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + /* + * Unpin physical memory we referred to in the vmcs02. The APIC access + * page's backing page (yeah, confusing) shouldn't actually be accessed, + * and if it is written, the contents are irrelevant. 
+ */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -1223,7 +1224,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | /* reserved */ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; + u64 vmx_basic = vmcs_config.nested.basic; if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) return -EINVAL; @@ -1246,36 +1247,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) return 0; } -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, + u32 **low, u32 **high) { - u64 supported; - u32 *lowp, *highp; - switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; + *low = &msrs->pinbased_ctls_low; + *high = &msrs->pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; + *low = &msrs->procbased_ctls_low; + *high = &msrs->procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; + *low = &msrs->exit_ctls_low; + *high = &msrs->exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; + *low = &msrs->entry_ctls_low; + *high = &msrs->entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; + *low = &msrs->secondary_ctls_low; + *high = &msrs->secondary_ctls_high; break; default: BUG(); } +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u32 *lowp, *highp; + u64 supported; + + vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); supported = vmx_control_msr(*lowp, *highp); @@ -1287,6 +1294,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) return -EINVAL; + vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); *lowp = data; *highp = data >> 32; return 0; @@ -1300,10 +1308,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | /* reserved */ GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, + vmcs_config.nested.misc_high); if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) return -EINVAL; @@ -1331,10 +1337,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); + u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, + vmcs_config.nested.vpid_caps); /* Every bit is either reserved or a feature bit. 
*/ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) @@ -1345,20 +1349,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) return 0; } -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) { - u64 *msr; - switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; + return &msrs->cr0_fixed0; case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; + return &msrs->cr4_fixed0; default: BUG(); } +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); /* * 1 bits (which indicates bits which "must-be-1" during VMX operation) @@ -1367,7 +1372,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL; - *msr = data; + *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; return 0; } @@ -1428,7 +1433,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: - if (data & ~vmx->nested.msrs.vmfunc_controls) + if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL; vmx->nested.msrs.vmfunc_controls = data; return 0; @@ -2133,6 +2138,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host @@ -2175,6 +2182,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + /* * Set the MSR load/store lists to match L0's settings. 
Only the * addresses are constant (for vmcs02), the counts can change based @@ -2514,11 +2524,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); } if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -2547,7 +2557,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_get_l2_tsc_multiplier(vcpu)); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -2613,6 +2623,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -3158,8 +3169,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_host_map *map; - struct page *page; - u64 hpa; if (!vcpu->arch.pdptrs_from_userspace && !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -3174,23 +3183,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. 
- */ - if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - if (!is_error_page(page)) { - vmx->nested.apic_access_page = page; - hpa = page_to_phys(vmx->nested.apic_access_page); - vmcs_write64(APIC_ACCESS_ADDR, hpa); + map = &vmx->nested.apic_access_page_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); } else { - pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -3373,11 +3371,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && - !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* @@ -4096,8 +4096,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } @@ -4336,7 +4334,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); @@ -4609,7 +4608,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); if (vmx->nested.l1_tpr_threshold != -1) @@ -4626,10 +4625,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -4828,28 +4824,6 @@ int get_vmx_mem_address(struct kvm_vcpu 
*vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl) -{ - struct vcpu_vmx *vmx; - - if (!nested_vmx_allowed(vcpu)) - return; - - vmx = to_vmx(vcpu); - if (vcpu_has_perf_global_ctrl) { - vmx->nested.msrs.entry_ctls_high |= - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high |= - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } else { - vmx->nested.msrs.entry_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high &= - ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } -} - static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret) { @@ -4952,7 +4926,7 @@ out_vmcs02: } /* Emulate the VMXON instruction. */ -static int handle_vmon(struct kvm_vcpu *vcpu) +static int handle_vmxon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; @@ -4962,20 +4936,25 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode. + * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks + * that have higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON), as KVM must load valid CR0/CR4 values into hardware while + * running the guest, i.e. KVM needs to check the _guest_ values. + * + * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and + * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real + * Mode, but KVM will never take the guest out of those modes. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - /* CPL=0 must be checked manually. */ + /* + * CPL=0 and all other checks that are lower priority than VM-Exit must + * be checked manually. 
+ */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 1; @@ -5044,7 +5023,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) } /* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) +static int handle_vmxoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; @@ -6111,6 +6090,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; default: return true; } @@ -6775,6 +6757,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + if (vmx_umip_emulated()) + msrs->cr4_fixed1 |= X86_CR4_UMIP; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } @@ -6818,8 +6803,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; - exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmxon; exit_handlers[EXIT_REASON_INVEPT] = handle_invept; exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index c92cea0b8ccc..88b00a7359e4 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,8 +32,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); @@ -281,7 +279,8 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - return fixed_bits_valid(val, fixed0, fixed1); + return fixed_bits_valid(val, fixed0, fixed1) && + __kvm_is_valid_cr4(vcpu, val); } /* No difference in the restrictions on guest and host CR4 in VMX operation. 
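nested_cr4_valid() above combines the fixed-bit check derived from the IA32_VMX_CR4_FIXED0/FIXED1 MSRs with __kvm_is_valid_cr4(). The fixed-bit rule itself is simple: FIXED0 lists bits that must be 1 and FIXED1 lists bits that are allowed to be 1. The sketch below mirrors the kernel's fixed_bits_valid() helper; the example MSR values are made up, except that CR4.VMXE (bit 13) is the classic must-be-1 bit while in VMX operation:

#include <stdio.h>

/* val is acceptable iff every FIXED0 bit is set and no bit outside FIXED1 is. */
static int fixed_bits_valid(unsigned long val, unsigned long fixed0,
			    unsigned long fixed1)
{
	return ((val & fixed1) | fixed0) == val;
}

int main(void)
{
	unsigned long cr4_fixed0 = 1ul << 13;	/* CR4.VMXE must be 1 */
	unsigned long cr4_fixed1 = ~0ul;	/* every other bit allowed */

	printf("%d\n", fixed_bits_valid(1ul << 13, cr4_fixed0, cr4_fixed1)); /* 1 */
	printf("%d\n", fixed_bits_valid(0, cr4_fixed0, cr4_fixed1));         /* 0 */
	return 0;
}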
*/ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 37e9eb32e3d9..862c1a4d971b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -37,23 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { + struct kvm_pmc *pmc; + u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; + pmu->fixed_ctr_ctrl = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); if (old_ctrl == new_ctrl) continue; + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc, new_ctrl, i); + reprogram_counter(pmc); } +} - pmu->fixed_ctr_ctrl = data; +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } } /* function is called when global control register has been updated. */ @@ -61,14 +73,18 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) { int bit; u64 diff = pmu->global_ctrl ^ data; + struct kvm_pmc *pmc; pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + if (pmc) + reprogram_counter(pmc); + } } -static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; @@ -82,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) /* disable event that reported as not present by cpuid */ if ((i < 7) && !(pmu->available_event_types & (1 << i))) - return PERF_COUNT_HW_MAX + 1; + return false; break; } - if (i == ARRAY_SIZE(intel_arch_events)) - return PERF_COUNT_HW_MAX; - - return intel_arch_events[i].event_type; + return true; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. 
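reprogram_fixed_counters() earlier in this hunk compares the old and new 4-bit control field of each fixed counter before touching the PMC. The packing in IA32_FIXED_CTR_CTRL is one nibble per counter (OS enable, USR enable, AnyThread, PMI). The sketch below is illustrative only; the helper name is local to this sketch, standing in for the kernel's fixed_ctrl_field():

#include <stdio.h>

/* 4 control bits per fixed counter in IA32_FIXED_CTR_CTRL. */
static unsigned int fixed_field(unsigned long long ctrl, int idx)
{
	return (ctrl >> (idx * 4)) & 0xf;
}

int main(void)
{
	/* Fixed counter 0 counts ring 0+3, fixed counter 1 counts ring 3 only. */
	unsigned long long ctrl = 0x3ull | (0x2ull << 4);
	int i;

	for (i = 0; i < 3; i++)
		printf("fixed counter %d: ctrl %#x\n", i, fixed_field(ctrl, i));
	return 0;
}

Only counters whose nibble actually changed are fed back into reprogram_counter(), which is why the function snapshots the old fixed_ctr_ctrl before overwriting it.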
*/ @@ -98,19 +111,10 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + if (!intel_pmu_has_perf_global_ctrl(pmu)) + return true; - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) @@ -167,16 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) -{ - /* - * As a first step, a guest could only enable LBR feature if its - * cpu model is the same as the host because the LBR registers - * would be pass-through to the guest and they're model specific. - */ - return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); -} - bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -205,6 +199,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities; int ret; switch (msr) { @@ -212,7 +207,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_STATUS: case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; + return intel_pmu_has_perf_global_ctrl(pmu); + break; + case MSR_IA32_PEBS_ENABLE: + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; + break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; + case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || @@ -361,6 +367,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; return 0; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + return 0; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + return 0; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -395,7 +410,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: if (pmu->fixed_ctr_ctrl == data) return 0; - if (!(data & 0xfffffffffffff444ull)) { + if (!(data & pmu->fixed_ctr_ctrl_mask)) { reprogram_fixed_counters(pmu, data); return 0; } @@ -421,6 +436,29 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask)) { + pmu->pebs_enable = data; + return 0; + } + break; + case MSR_IA32_DS_AREA: + if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; + pmu->ds_area = data; + return 0; + case 
MSR_PEBS_DATA_CFG: + if (pmu->pebs_data_cfg == data) + return 0; + if (!(data & pmu->pebs_data_cfg_mask)) { + pmu->pebs_data_cfg = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -445,7 +483,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { - reprogram_gp_counter(pmc, data); + pmc->eventsel = data; + reprogram_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) @@ -474,11 +513,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - - struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 perf_capabilities; + u64 counter_mask; + int i; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -487,8 +527,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_ovf_ctrl_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; @@ -498,13 +543,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; - perf_get_x86_pmu_capability(&x86_pmu); - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - x86_pmu.num_counters_gp); - eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -514,17 +559,19 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min3(ARRAY_SIZE(fixed_pmc_events), (size_t) edx.split.num_counters_fixed, - (size_t) x86_pmu.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, - edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; setup_fixed_pmc_eventsel(pmu); } - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); @@ -532,7 +579,7 @@ static void 
intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry(vcpu, 7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { @@ -545,16 +592,29 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - - if (intel_pmu_lbr_is_compatible(vcpu)) + if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = counter_mask; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + pmu->pebs_data_cfg_mask = ~0xff00000full; + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -719,8 +779,28 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !intel_pmc_is_enabled(pmc)) + continue; + + if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { + pmu->host_cross_mapped_mask |= + BIT_ULL(pmc->perf_event->hw.idx); + } + } +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { - .pmc_perf_hw_id = intel_pmc_perf_hw_id, + .hw_event_available = intel_hw_event_available, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 07e5fcf5a5aa..1b56c5e5c9fb 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) { /* * PID.ON can be set at any time by a different vCPU or by hardware, @@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) * update must be retried with a fresh snapshot an ON change causes * the cmpxchg to fail. */ - if (cmpxchg64(&pi_desc->control, old, new) != old) + if (!try_cmpxchg64(&pi_desc->control, pold, new)) return -EBUSY; return 0; @@ -96,8 +96,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!x2apic_mode) dest = (dest << 8) & 0xFF00; + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); + new.control = old.control; /* * Clear SN (as above) and refresh the destination APIC ID to @@ -111,7 +112,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * descriptor was modified on "put" to use the wakeup vector. 
*/ new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); local_irq_restore(flags); @@ -156,12 +157,12 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); - /* set 'NV' to 'wakeup vector' */ + new.control = old.control; new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); /* * Send a wakeup IPI to this CPU if an interrupt may have been posted @@ -177,11 +178,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) local_irq_restore(flags); } +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. + */ + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!vmx_needs_pi_wakeup(vcpu)) return; if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 9a45d5c9f116..26992076552e 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,6 +5,8 @@ #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 35e7ec91ae86..aba8cebdc587 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -79,7 +79,7 @@ static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, else *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); - if (*gpa == UNMAPPED_GVA) { + if (*gpa == INVALID_GPA) { kvm_inject_emulated_page_fault(vcpu, &ex); return -EFAULT; } @@ -148,8 +148,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, u8 max_size_log2; int trapnr, ret; - sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); - sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { kvm_prepare_emulation_failure_exit(vcpu); return 0; @@ -431,7 +431,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; @@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 2b9d7a7e83f7..ac290a44a693 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h 
@@ -50,6 +50,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index be7c19374fdd..d7f8331d6f7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -116,6 +119,9 @@ module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); @@ -443,18 +449,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault) noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) @@ -1787,7 +1795,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; - return kvm_default_tsc_scaling_ratio; + return kvm_caps.default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -2111,6 +2119,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (is_noncanonical_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; + + if (is_guest_mode(vcpu) && + ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) + get_vmcs12(vcpu)->guest_bndcfgs = data; + vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: @@ -2312,7 +2326,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); @@ -2489,6 +2514,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + 
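A quick aside on the adjust_vmx_controls64() helper added just above: unlike the legacy 32-bit VMX capability MSRs, whose low half reports the allowed-0 settings and whose high half reports the allowed-1 settings, the 64-bit tertiary-controls capability MSR reports only allowed-1 settings, so the requested optional controls are simply ANDed with the raw MSR value. The minimal user-space sketch below mirrors that masking; the sample MSR readings are made up (a real read of MSR_IA32_VMX_PROCBASED_CTLS3 needs ring 0 or /dev/cpu/N/msr), and the IPI-virtualization bit position is assumed here purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Assumed bit position of IPI virtualization in the tertiary controls. */
#define TERTIARY_EXEC_IPI_VIRT (1ULL << 4)

/* Same semantics as the kernel helper: keep only the allowed-1 bits. */
static uint64_t adjust_vmx_controls64(uint64_t ctl_opt, uint64_t msr_val)
{
	return ctl_opt & msr_val;
}

int main(void)
{
	uint64_t opt = TERTIARY_EXEC_IPI_VIRT;
	/* Hypothetical IA32_VMX_PROCBASED_CTLS3 readings. */
	uint64_t cpu_with_ipiv = TERTIARY_EXEC_IPI_VIRT;
	uint64_t cpu_without_ipiv = 0;

	printf("kept:    %#llx\n",
	       (unsigned long long)adjust_vmx_controls64(opt, cpu_with_ipiv));
	printf("dropped: %#llx\n",
	       (unsigned long long)adjust_vmx_controls64(opt, cpu_without_ipiv));
	return 0;
}

As the setup_vmcs_config() hunk below shows, the helper is only consulted when the primary controls accepted CPU_BASED_ACTIVATE_TERTIARY_CONTROLS, so _cpu_based_3rd_exec_control stays 0 on CPUs without tertiary control support.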
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2497,8 +2531,26 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. + */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | @@ -2518,7 +2570,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2551,7 +2604,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -2581,15 +2635,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); } else if (vmx_cap->ept) { - vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_cap->vpid) { - vmx_cap->vpid = 0; + vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; + } + + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; @@ -2630,6 +2699,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control) < 0) return -EIO; + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } + /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata 
where VM Exit may incorrectly clear @@ -2678,6 +2764,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -3230,8 +3317,8 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be - * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_is_valid_cr4(). + * enabled under SMM. Note, whether or not VMXE is allowed at all, + * i.e. is a reserved bit, is handled by common x86 code. */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; @@ -3702,7 +3789,7 @@ static int init_rmode_identity_map(struct kvm *kvm) } /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { @@ -3932,6 +4019,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -3977,20 +4066,26 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) u32 i; /* - * Set intercept permissions for all potentially passed through MSRs - * again. They will automatically get filtered through the MSR filter, - * so we are back in sync after this. + * Redo intercept permissions for MSRs that KVM is passing through to + * the guest. Disabling interception will check the new MSR filter and + * ensure that KVM enables interception if userspace wants to filter + * the MSR. MSRs that KVM is already intercepting don't need to be + * refreshed since KVM is going to intercept them regardless of what + * userspace wants. */ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { u32 msr = vmx_possible_passthrough_msrs[i]; - bool read = test_bit(i, vmx->shadow_msr_intercept.read); - bool write = test_bit(i, vmx->shadow_msr_intercept.write); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + if (!test_bit(i, vmx->shadow_msr_intercept.read)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); + + if (!test_bit(i, vmx->shadow_msr_intercept.write)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); } - pt_update_intercept_for_msr(vcpu); + /* PT MSRs can be passed through iff PT is exposed to the guest. */ + if (vmx_pt_mode_is_host_guest()) + pt_update_intercept_for_msr(vcpu); } static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4085,7 +4180,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!r) return 0; - if (!vcpu->arch.apicv_active) + /* Note, this is called iff the local APIC is in-kernel. 
*/ + if (!vcpu->arch.apic->apicv_active) return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) @@ -4259,15 +4355,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + if (kvm_vcpu_apicv_active(vcpu)) { + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); } vmx_update_msr_bitmap_x2apic(vcpu); @@ -4299,6 +4399,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a @@ -4441,13 +4555,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4464,6 +4613,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); @@ -4476,12 +4628,20 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids 
- 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -4652,13 +4812,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -5770,6 +5930,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -5827,6 +6013,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -5924,6 +6111,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; int efer_slot; @@ -5937,9 +6125,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); @@ -6039,9 +6234,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), @@ -6191,7 +6387,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -6453,7 +6650,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) put_page(page); } -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +static void vmx_hwapic_isr_update(int max_isr) { u16 status; u8 old; @@ -6783,9 +6980,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
*/ - msrs = perf_guest_get_msrs(&nr_msrs); + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; @@ -7166,6 +7368,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -7234,7 +7440,7 @@ static int __init vmx_check_processor_compat(void) return 0; } -static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -7310,7 +7516,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7326,7 +7532,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7337,23 +7543,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7361,7 +7550,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; @@ -7445,10 +7634,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) { + if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) @@ -7502,6 +7689,13 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } + + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7514,7 +7708,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - supported_xss = 0; + kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) 
kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7655,9 +7849,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, + kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; @@ -7729,6 +7923,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * TODO: Implement custom flows for forcing the vCPU out/in of L2 on + * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong + * SMI and RSM only modify state that is saved and restored via SMRAM. + * E.g. most MSRs are left untouched, but many are modified by VM-Exit + * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. + */ vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); @@ -7802,6 +8003,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) return supported & BIT(reason); } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", @@ -7813,7 +8021,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_vcpu_create, .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, @@ -8027,8 +8237,8 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_mpx()) - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) @@ -8091,12 +8301,16 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ @@ -8153,11 +8367,12 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_mce_cap_supported |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_CMCI_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 
1e7f9453894b..fb8e3480a9d7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -92,10 +92,22 @@ union vmx_exit_reason { u32 full; }; +static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + */ + return pmu->version > 1; +} + #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); @@ -205,7 +217,7 @@ struct nested_vmx { * Guest pages referred to in the vmcs02 with host-physical * pointers, so we must keep them pinned while L2 runs. */ - struct page *apic_access_page; + struct kvm_host_map apic_access_page_map; struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; @@ -220,9 +232,18 @@ struct nested_vmx { bool has_preemption_timer_deadline; bool preemption_timer_expired; - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; + /* + * Used to snapshot MSRs that are conditionally loaded on VM-Enter in + * order to propagate the guest's pre-VM-Enter value into vmcs02. For + * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value. + * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_ + * userspace restores MSRs before nested state. If userspace restores + * MSRs after nested state, the snapshot holds garbage, but KVM can't + * detect that, and the garbage value in vmcs02 will be overwritten by + * MSR restoration in any case. 
+ */ + u64 pre_vmenter_debugctl; + u64 pre_vmenter_bndcfgs; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; @@ -369,6 +390,8 @@ struct kvm_vmx { unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -462,35 +485,36 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ -{ \ - return vmcs->controls_shadow.lname; \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return __##lname##_controls_get(vmx->loaded_vmcs); \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) /* * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the @@ -586,4 +610,9 @@ static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) return (vmx_instr_info >> 28) & 0xf; } +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5fa335a4ea7..33560bfa0cac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,8 +87,11 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +struct kvm_caps kvm_caps __read_mostly = 
{ + .supported_mce_cap = MCG_CTL_P | MCG_SER_P, +}; +EXPORT_SYMBOL_GPL(kvm_caps); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -151,19 +154,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); -bool __read_mostly kvm_has_bus_lock_exit; -EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); @@ -235,8 +225,6 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); -u64 __read_mostly supported_xss; -EXPORT_SYMBOL_GPL(supported_xss); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -298,7 +286,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), - STATS_DESC_IBOOLEAN(VCPU, guest_mode) + STATS_DESC_IBOOLEAN(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -311,8 +300,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { }; u64 __read_mostly host_xcr0; -u64 __read_mostly supported_xcr0; -EXPORT_SYMBOL_GPL(supported_xcr0); static struct kmem_cache *x86_emulator_cache; @@ -862,7 +849,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) */ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. 
*/ @@ -1094,7 +1081,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return false; @@ -1102,9 +1089,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; - return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + return true; +} +EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); + +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { @@ -1450,6 +1443,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -2051,13 +2045,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) -{ - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); - int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2065,11 +2052,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) @@ -2349,12 +2351,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? 
*/ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -2366,10 +2368,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; @@ -2387,7 +2389,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -2464,18 +2466,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; @@ -2502,11 +2504,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; - if (l2_multiplier == kvm_default_tsc_scaling_ratio) + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; @@ -2515,9 +2517,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - if (l2_multiplier != kvm_default_tsc_scaling_ratio) + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); return l1_multiplier; } @@ -2559,7 +2561,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli else vcpu->arch.tsc_scaling_ratio = l1_multiplier; - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) static_call(kvm_x86_write_tsc_multiplier)( vcpu, vcpu->arch.tsc_scaling_ratio); } @@ -2695,7 +2697,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); @@ -3108,7 +3110,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) tgt_tsc_khz = 
kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); @@ -3198,6 +3200,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3216,6 +3228,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3229,32 +3242,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3538,7 +3572,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 
0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -3560,9 +3595,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; @@ -3571,6 +3618,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; @@ -3597,7 +3645,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data & ~supported_xss) + if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); @@ -3695,6 +3743,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: @@ -3785,6 +3834,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3799,6 +3859,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3816,16 +3877,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... 
MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3865,9 +3937,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; @@ -3922,7 +4001,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -4038,6 +4118,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -4280,6 +4361,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4296,6 +4378,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4357,7 +4440,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_TSC_CONTROL: case KVM_CAP_VM_TSC_CONTROL: - r = kvm_has_tsc_control; + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; @@ -4379,7 +4462,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: - if (kvm_has_bus_lock_exit) + if (kvm_caps.has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else @@ -4388,17 +4471,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XSAVE2: { u64 guest_perm = xstate_get_guest_group_perm(); - r = xstate_required_size(supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + } case KVM_CAP_PMU_CAPABILITY: r = enable_pmu ? 
KVM_CAP_PMU_VALID_MASK : 0; break; - } case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -4426,7 +4512,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(supported_xcr0, uaddr)) + if (put_user(kvm_caps.supported_xcr0, uaddr)) return -EFAULT; return 0; default: @@ -4503,8 +4589,8 @@ long kvm_arch_dev_ioctl(struct file *filp, } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -4803,22 +4889,63 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } + + kvm_apic_after_set_mcg_cap(vcpu); static_call(kvm_x86_setup_mce)(vcpu); out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -4828,6 +4955,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -4835,7 +4968,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank @@ -4941,6 +5073,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | 
KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4954,7 +5090,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5027,6 +5164,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -5095,7 +5241,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -5600,8 +5747,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6028,6 +6175,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: r = -EINVAL; if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | @@ -6046,7 +6197,7 @@ split_irqchip_unlock: (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; - if (kvm_has_bus_lock_exit && + if (kvm_caps.has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; @@ -6109,6 +6260,65 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. 
+ * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -6584,8 +6794,8 @@ set_pit2_out: r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6660,15 +6870,12 @@ out: static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -6720,12 +6927,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: @@ -6882,7 +7089,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -6913,7 +7120,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, /* Inline kvm_read_guest_virt_helper for speed. 
*/ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, exception); - if (unlikely(gpa == UNMAPPED_GVA)) + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -6983,7 +7190,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -7094,7 +7301,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -7331,7 +7538,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -7385,36 +7592,47 @@ emul_write: return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *data, + unsigned int count, bool in) { - int r = 0, i; + unsigned i; + int r; - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + WARN_ON_ONCE(vcpu->arch.pio.count); + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0. + */ + if (in) + memset(data, 0, size * (count - i)); break; - pd += vcpu->arch.pio.size; + } + + data += size; } - return r; -} + return 1; -static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, - unsigned int count, bool in) -{ +userspace_io: vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) - return 1; + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? 
KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -7422,30 +7640,33 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } -static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - WARN_ON(vcpu->arch.pio.count); - memset(vcpu->arch.pio_data, 0, size * count); - return emulator_pio_in_out(vcpu, size, port, count, true); + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); + + return r; } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) { int size = vcpu->arch.pio.size; - unsigned count = vcpu->arch.pio.count; + unsigned int count = vcpu->arch.pio.count; memcpy(val, vcpu->arch.pio_data, size * count); trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); if (vcpu->arch.pio.count) { /* * Complete a previous iteration that required userspace I/O. @@ -7454,39 +7675,19 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, * shenanigans as KVM doesn't support modifying the rep count, * and the emulator ensures @count doesn't overflow the buffer. */ - } else { - int r = __emulator_pio_in(vcpu, size, port, count); - if (!r) - return r; - - /* Results already available, fall through. */ + complete_emulator_pio_in(vcpu, val); + return 1; } - complete_emulator_pio_in(vcpu, val); - return 1; -} - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) -{ - return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); - + return emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { - int ret; - - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - ret = emulator_pio_in_out(vcpu, size, port, count, false); - if (ret) - vcpu->arch.pio.count = 0; - - return ret; + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -7868,7 +8069,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ + struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, @@ -8145,7 +8355,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * If the mapping is invalid in guest, let cpu retry * it to generate fault. 
*/ - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return true; } @@ -8672,11 +8882,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform - * the copy and tracing - */ - emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -8751,7 +8957,7 @@ static void kvm_hyperv_tsc_notifier(void) /* TSC frequency always matches when on Hyper-V */ for_each_present_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { __kvm_start_pvclock_update(kvm); @@ -8952,25 +9158,23 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; + u64 host_pat; int r; if (kvm_x86_ops.hardware_enable) { pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); - r = -EEXIST; - goto out; + return -EEXIST; } if (!ops->cpu_has_kvm_support()) { pr_err_ratelimited("kvm: no hardware support for '%s'\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -8980,27 +9184,37 @@ int kvm_arch_init(void *opaque) */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } - r = -ENOMEM; + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. 
+ */ + if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + (host_pat & GENMASK(2, 0)) != 6) { + pr_err("kvm: host PAT[0] is not WB\n"); + return -EIO; + } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out; + return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + r = -ENOMEM; goto out_free_x86_emulator_cache; } kvm_nr_uret_msrs = 0; @@ -9013,7 +9227,7 @@ int kvm_arch_init(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } if (pi_inject_timer == -1) @@ -9031,7 +9245,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out: return r; } @@ -9406,7 +9619,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (vcpu->arch.apicv_active) + if (vcpu->arch.apic->apicv_active) return; if (!vcpu->arch.apic->vapic_addr) @@ -9435,6 +9648,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) vcpu->arch.exception.error_code = false; static_call(kvm_x86_queue_exception)(vcpu); @@ -9470,7 +9688,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, true); can_inject = false; } } @@ -9492,13 +9710,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -9512,6 +9723,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) } kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + can_inject = false; } @@ -9564,7 +9779,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, false); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) @@ -9857,6 +10072,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; bool activate; if (!lapic_in_kernel(vcpu)) @@ -9865,12 +10081,14 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) down_read(&vcpu->kvm->arch.apicv_update_lock); preempt_disable(); - activate = kvm_vcpu_apicv_activated(vcpu); + /* Do not activate APICV when APIC is disabled */ 
+ activate = kvm_vcpu_apicv_activated(vcpu) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); - if (vcpu->arch.apicv_active == activate) + if (apic->apicv_active == activate) goto out; - vcpu->arch.apicv_active = activate; + apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9880,7 +10098,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * still active when the interrupt got accepted. Make sure * inject_pending_event() is called to check for that. */ - if (!vcpu->arch.apicv_active) + if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); out: @@ -10275,7 +10493,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * per-VM state, and responsing vCPUs must wait for the update * to complete before servicing KVM_REQ_APICV_UPDATE. */ - WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)); + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) @@ -10654,8 +10873,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = cui(vcpu); if (r <= 0) goto out; - } else - WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); + } else { + WARN_ON_ONCE(vcpu->arch.pio.count); + WARN_ON_ONCE(vcpu->mmio_needed); + } if (kvm_run->immediate_exit) { r = -EINTR; @@ -11183,7 +11404,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; + tr->valid = gpa != INVALID_GPA; tr->writeable = 1; tr->usermode = 0; @@ -11276,11 +11497,17 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return 0; + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + + return static_call(kvm_x86_vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -11317,7 +11544,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) * will ensure the vCPU gets the correct state before VM-Entry. 
*/ if (enable_apicv) { - vcpu->arch.apicv_active = true; + vcpu->arch.apic->apicv_active = true; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } } else @@ -11330,9 +11557,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) + vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) goto fail_free_pio_data; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; @@ -11386,6 +11615,7 @@ free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: @@ -11431,6 +11661,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); @@ -11510,6 +11741,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; + vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); @@ -11526,7 +11759,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry * on RESET. But, go through the motions in case that's ever remedied. */ - cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); @@ -11717,6 +11950,8 @@ int kvm_arch_hardware_setup(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_pmu_capability(); + r = ops->hardware_setup(); if (r != 0) return r; @@ -11726,13 +11961,13 @@ int kvm_arch_hardware_setup(void *opaque) kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; + kvm_caps.supported_xss = 0; #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); #undef __kvm_cpu_cap_has - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that * fit into a signed integer. @@ -11740,10 +11975,10 @@ int kvm_arch_hardware_setup(void *opaque) * be 1 on all machines. 
*/ u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; } - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } @@ -12331,7 +12566,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + if (kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) return true; return false; @@ -12773,7 +13009,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed @@ -12998,6 +13234,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); +static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) +{ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * size; +} + static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); @@ -13021,8 +13263,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); /* memcpy done already by emulator_pio_out. */ - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + advance_sev_es_emulated_pio(vcpu, count, size); if (!ret) break; @@ -13038,20 +13279,14 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); -static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) -{ - unsigned count = vcpu->arch.pio.count; - complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; -} - static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { + unsigned count = vcpu->arch.pio.count; int size = vcpu->arch.pio.size; int port = vcpu->arch.pio.port; - advance_sev_es_emulated_ins(vcpu); + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + advance_sev_es_emulated_pio(vcpu, count, size); if (vcpu->arch.sev_pio_count) return kvm_sev_es_ins(vcpu, size, port); return 1; @@ -13063,11 +13298,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, for (;;) { unsigned int count = min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); - if (!__emulator_pio_in(vcpu, size, port, count)) + if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) break; /* Emulation done by the kernel. 
*/ - advance_sev_es_emulated_ins(vcpu); + advance_sev_es_emulated_pio(vcpu, count, size); if (!vcpu->arch.sev_pio_count) return 1; } @@ -13110,6 +13345,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 588792f00334..1926d2cb8e79 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,27 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +struct kvm_caps { + /* control of guest tsc rate supported? */ + bool has_tsc_control; + /* maximum supported tsc_khz for guests */ + u32 max_guest_tsc_khz; + /* number of bits of the fractional part of the TSC scaling ratio */ + u8 tsc_scaling_ratio_frac_bits; + /* maximum allowed value of TSC scaling ratio */ + u64 max_tsc_scaling_ratio; + /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */ + u64 default_tsc_scaling_ratio; + /* bus lock detection supported? */ + bool has_bus_lock_exit; + /* notify VM exit supported? */ + bool has_notify_vmexit; + + u64 supported_mce_cap; + u64 supported_xcr0; + u64 supported_xss; +}; + void kvm_spurious_fault(void); #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ @@ -283,14 +304,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; -extern u64 supported_xcr0; extern u64 host_xss; -extern u64 supported_xss; + +extern struct kvm_caps kvm_caps; + extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { - return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) + return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) == (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); } @@ -344,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } +static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) +{ + return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; +} + enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. 
*/ KVM_HANDLING_IRQ = 1, @@ -407,7 +434,7 @@ static inline void kvm_machine_check(void) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 610beba35907..a0c05ccbf4b1 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1049,7 +1049,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, else vcpu->arch.xen.poll_evtchn = -1; - set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; @@ -1071,7 +1071,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, *r = 0; out: /* Really, this is only needed in case of timeout */ - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (unlikely(sched_poll.nr_ports > 1)) kfree(ports); @@ -1311,7 +1311,7 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) int poll_evtchn = vcpu->arch.xen.poll_evtchn; if ((poll_evtchn == port || poll_evtchn == -1) && - test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) { + test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } @@ -1344,7 +1344,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); if (!vcpu) return -EINVAL; - WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu)); + WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); } if (!vcpu->arch.xen.vcpu_info_cache.active) @@ -1540,7 +1540,7 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, */ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); if (vcpu) - e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu); + e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx; else e->xen_evtchn.vcpu_idx = -1; |
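
For context on the new uAPI that the hunks above introduce (KVM_CAP_X86_TRIPLE_FAULT_EVENT, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES, and the KVM_VCPUEVENT_VALID_TRIPLE_FAULT flag), the following is a rough userspace sketch, not part of the patch, showing how these capabilities would typically be enabled through the standard KVM_ENABLE_CAP ioctl. It assumes kernel headers new enough to define these constants and the triple_fault field in struct kvm_vcpu_events; error handling is deliberately minimal.

/*
 * Illustrative sketch only: exercises two of the VM capabilities added in
 * this series via KVM_ENABLE_CAP. Constants and the triple_fault field are
 * assumed to come from post-5.19 <linux/kvm.h>.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_enable_cap cap;
	struct kvm_vcpu_events events;
	int kvm, vm, vcpu;

	kvm = open("/dev/kvm", O_RDWR);
	vm = ioctl(kvm, KVM_CREATE_VM, 0);

	/*
	 * Per the handler above: must be requested before the first vCPU is
	 * created, args[0] must be 0, and the caller needs CAP_SYS_BOOT
	 * (capable(), not ns_capable()), otherwise KVM returns -EPERM/-EINVAL.
	 */
	if (ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_VM_DISABLE_NX_HUGE_PAGES) > 0) {
		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_VM_DISABLE_NX_HUGE_PAGES;
		if (ioctl(vm, KVM_ENABLE_CAP, &cap))
			perror("disable NX huge pages");
	}

	/* Opt in to reporting pending triple faults via vCPU events. */
	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_TRIPLE_FAULT_EVENT;
	cap.args[0] = 1;
	if (ioctl(vm, KVM_ENABLE_CAP, &cap))
		perror("triple fault event");

	vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);

	/*
	 * With the cap enabled, KVM_GET_VCPU_EVENTS sets
	 * KVM_VCPUEVENT_VALID_TRIPLE_FAULT and fills triple_fault.pending;
	 * KVM_SET_VCPU_EVENTS with the same flag can inject or clear it.
	 */
	memset(&events, 0, sizeof(events));
	if (!ioctl(vcpu, KVM_GET_VCPU_EVENTS, &events) &&
	    (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
		printf("triple fault pending: %u\n", events.triple_fault.pending);

	return 0;
}

The same KVM_ENABLE_CAP pattern applies to KVM_CAP_X86_NOTIFY_VMEXIT, where args[0] carries the KVM_X86_NOTIFY_VMEXIT_* flags in the low 32 bits and the notify window in the high 32 bits, again only before any vCPU has been created.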