author		Linus Torvalds	2022-08-04 14:59:54 -0700
committer	Linus Torvalds	2022-08-04 14:59:54 -0700
commit		7c5c3a6177fa9646884114fc7f2e970b0bc50dc9 (patch)
tree		956857522574ae7cb07d2227dc16e53d7e9e00e7 /arch/arm64/include/asm
parent		f0a892f599c46af673e47418c47c15e69a7b67f4 (diff)
parent		281106f938d3daaea6f8b6723a8217a2a1ef6936 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"Quite a large pull request due to a selftest API overhaul and some
patches that had come in too late for 5.19.
ARM:
- Unwinder implementations for both nVHE modes (classic and
protected), complete with an overflow stack
- Rework of the sysreg access from userspace, with a complete rewrite
of the vgic-v3 view to align with the rest of the infrastructure
- Disaggregation of the vcpu flags into separate sets to better track
their usage model (see the flag-accessor sketch after this list)
- A fix for the GICv2-on-v3 selftest
- A small set of cosmetic fixes
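
As a rough illustration of the disaggregated flags (the declarations and
accessors appear verbatim in the kvm_host.h hunk at the bottom of this
page), a single-bit flag names its set once and is then manipulated
through generic accessors; the wrapper function below is illustrative
only:

    /* Declared in kvm_host.h: #define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) */
    static void example_wfit_flagging(struct kvm_vcpu *vcpu)
    {
            vcpu_set_flag(vcpu, IN_WFIT);           /* WFIT trapped for this vcpu */

            if (vcpu_get_flag(vcpu, IN_WFIT))
                    ;                               /* e.g. block the vcpu until its timer fires */

            vcpu_clear_flag(vcpu, IN_WFIT);         /* WFIT handled */
    }
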
RISC-V:
- Track the ISA extensions used by the guest with a bitmap (see the
sketch after this list)
- Added system instruction emulation framework
- Added CSR emulation framework
- Added gfp_custom flag in struct kvm_mmu_memory_cache
- Added G-stage ioremap() and iounmap() functions
- Added support for Svpbmt inside Guest
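
As referenced above, the bitmap tracking follows the standard kernel
bitmap pattern; the structure, field and bound below are illustrative
stand-ins, not the actual RISC-V KVM identifiers:

    #include <linux/bitmap.h>

    #define MY_ISA_EXT_MAX  64                      /* illustrative bound */

    struct my_vcpu_isa {
            DECLARE_BITMAP(ext, MY_ISA_EXT_MAX);    /* one bit per ISA extension */
    };

    static void my_mark_ext_in_use(struct my_vcpu_isa *isa, unsigned int ext_id)
    {
            set_bit(ext_id, isa->ext);
    }

    static bool my_ext_in_use(struct my_vcpu_isa *isa, unsigned int ext_id)
    {
            return test_bit(ext_id, isa->ext);
    }
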
s390:
- Add an interface to provide a hypervisor dump for secure guests
- Improve selftests to use the TAP interface (see the sketch after
this list)
- Enable interpretive execution of zPCI instructions (for PCI
passthrough)
- First part of deferred teardown
- CPU Topology
- PV attestation
- Minor fixes
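
The TAP conversion referenced above presumably builds on the generic
kselftest helpers; a minimal sketch of that style, assuming the ksft_*
API from tools/testing/selftests/kselftest.h, looks like this (the
checks themselves are placeholders):

    #include "kselftest.h"

    int main(void)
    {
            ksft_print_header();                    /* "TAP version 13" */
            ksft_set_plan(2);                       /* "1..2" */

            ksft_test_result_pass("first placeholder check\n");
            ksft_test_result_pass("second placeholder check\n");

            ksft_exit_pass();                       /* summary line, exit code 0 */
    }
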
x86:
- Permit guests to ignore single-bit ECC errors
- Intel IPI virtualization
- Allow getting/setting pending triple fault with
KVM_GET/SET_VCPU_EVENTS
- PEBS virtualization
- Simplify PMU emulation by just using PERF_TYPE_RAW events
- More accurate event reinjection on SVM (avoid retrying
instructions)
- Allow getting/setting the state of the speaker port data bit
- Refuse starting the kvm-intel module if VM-Entry/VM-Exit controls
are inconsistent
- "Notify" VM exit (detect microarchitectural hangs) for Intel
- Use try_cmpxchg64 instead of cmpxchg64 (see the sketch after this
list)
- Ignore benign host accesses to PMU MSRs when PMU is disabled
- Allow disabling KVM's "MONITOR/MWAIT are NOPs!" behavior
- Allow NX huge page mitigation to be disabled on a per-vm basis
- Port eager page splitting to shadow MMU as well
- Enable CMCI capability by default and handle injected UCNA errors
- Expose pid of vcpu threads in debugfs
- x2AVIC support for AMD
- Clean up PIO emulation
- Fixes for LLDT/LTR emulation
- Don't require refcounted "struct page" to create huge SPTEs
- Miscellaneous cleanups:
  - MCE MSR emulation
  - Use separate namespaces for guest PTEs and shadow PTEs bitmasks
  - PIO emulation
  - Reorganize rmap API, mostly around rmap destruction
  - Do not work around very old KVM bugs for L0 that runs with nesting
    enabled
  - New selftests API for CPUID
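
The try_cmpxchg64 conversion mentioned above follows the usual
pattern; 'ptr' and 'flag' below are placeholders:

    u64 old, new;

    /* Before: compare the returned value by hand */
    do {
            old = READ_ONCE(*ptr);
            new = old | flag;
    } while (cmpxchg64(ptr, old, new) != old);

    /* After: try_cmpxchg64() returns a bool and refreshes 'old' on failure */
    old = READ_ONCE(*ptr);
    do {
            new = old | flag;
    } while (!try_cmpxchg64(ptr, &old, new));
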
Generic:
- Fix races in gfn->pfn cache refresh; do not pin pages tracked by
the cache
- New selftests API using struct kvm_vcpu instead of a (vm, id)
tuple (see the sketch below)"
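
For flavour, the selftest API overhaul means tests now pass a struct
kvm_vcpu pointer around rather than a (vm, id) pair; the before/after
below is only a sketch and the exact helper names may differ:

    /* Old style: the vm handle and a numeric vcpu id travel together */
    vm = vm_create_default(VCPU_ID, 0, guest_code);
    vcpu_run(vm, VCPU_ID);

    /* New style: a struct kvm_vcpu carries everything the helpers need */
    vm = vm_create_with_one_vcpu(&vcpu, guest_code);
    vcpu_run(vcpu);
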
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (606 commits)
selftests: kvm: set rax before vmcall
selftests: KVM: Add exponent check for boolean stats
selftests: KVM: Provide descriptive assertions in kvm_binary_stats_test
selftests: KVM: Check stat name before other fields
KVM: x86/mmu: remove unused variable
RISC-V: KVM: Add support for Svpbmt inside Guest/VM
RISC-V: KVM: Use PAGE_KERNEL_IO in kvm_riscv_gstage_ioremap()
RISC-V: KVM: Add G-stage ioremap() and iounmap() functions
KVM: Add gfp_custom flag in struct kvm_mmu_memory_cache
RISC-V: KVM: Add extensible CSR emulation framework
RISC-V: KVM: Add extensible system instruction emulation framework
RISC-V: KVM: Factor-out instruction emulation into separate sources
RISC-V: KVM: move preempt_disable() call in kvm_arch_vcpu_ioctl_run
RISC-V: KVM: Make kvm_riscv_guest_timer_init a void function
RISC-V: KVM: Fix variable spelling mistake
RISC-V: KVM: Improve ISA extension by using a bitmap
KVM, x86/mmu: Fix the comment around kvm_tdp_mmu_zap_leafs()
KVM: SVM: Dump Virtual Machine Save Area (VMSA) to klog
KVM: x86/mmu: Treat NX as a valid SPTE bit for NPT
KVM: x86: Do not block APIC write for non ICR registers
...
Diffstat (limited to 'arch/arm64/include/asm')
-rw-r--r--	arch/arm64/include/asm/kvm_asm.h		16
-rw-r--r--	arch/arm64/include/asm/kvm_emulate.h		11
-rw-r--r--	arch/arm64/include/asm/kvm_host.h		205
-rw-r--r--	arch/arm64/include/asm/memory.h			8
-rw-r--r--	arch/arm64/include/asm/stacktrace.h		62
-rw-r--r--	arch/arm64/include/asm/stacktrace/common.h	199
-rw-r--r--	arch/arm64/include/asm/stacktrace/nvhe.h	55
7 files changed, 438 insertions, 118 deletions
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2e277f2ed671..53035763e48e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -176,6 +176,22 @@ struct kvm_nvhe_init_params { unsigned long vtcr; }; +/* + * Used by the host in EL1 to dump the nVHE hypervisor backtrace on + * hyp_panic() in non-protected mode. + * + * @stack_base: hyp VA of the hyp_stack base. + * @overflow_stack_base: hyp VA of the hyp_overflow_stack base. + * @fp: hyp FP where the backtrace begins. + * @pc: hyp PC where the backtrace begins. + */ +struct kvm_nvhe_stacktrace_info { + unsigned long stack_base; + unsigned long overflow_stack_base; + unsigned long fp; + unsigned long pc; +}; + /* Translate a kernel address @ptr into its equivalent linear mapping */ #define kvm_ksym_ref(ptr) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0e66edd3aff2..9bdba47f7e14 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -473,9 +473,18 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { - vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; + WARN_ON(vcpu_get_flag(vcpu, PENDING_EXCEPTION)); + vcpu_set_flag(vcpu, INCREMENT_PC); } +#define kvm_pend_exception(v, e) \ + do { \ + WARN_ON(vcpu_get_flag((v), INCREMENT_PC)); \ + vcpu_set_flag((v), PENDING_EXCEPTION); \ + vcpu_set_flag((v), e); \ + } while (0) + + static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) { return test_bit(feature, vcpu->arch.features); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index de32152cea04..f38ef299f13b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -325,8 +325,30 @@ struct kvm_vcpu_arch { /* Exception Information */ struct kvm_vcpu_fault_info fault; - /* Miscellaneous vcpu state flags */ - u64 flags; + /* Ownership of the FP regs */ + enum { + FP_STATE_FREE, + FP_STATE_HOST_OWNED, + FP_STATE_GUEST_OWNED, + } fp_state; + + /* Configuration flags, set once and for all before the vcpu can run */ + u8 cflags; + + /* Input flags to the hypervisor code, potentially cleared after use */ + u8 iflags; + + /* State flags for kernel bookkeeping, unused by the hypervisor code */ + u8 sflags; + + /* + * Don't run the guest (internal implementation need). + * + * Contrary to the flags above, this is set/cleared outside of + * a vcpu context, and thus cannot be mixed with the flags + * themselves (or the flag accesses need to be made atomic). + */ + bool pause; /* * We maintain more than a single set of debug registers to support @@ -376,9 +398,6 @@ struct kvm_vcpu_arch { /* vcpu power state */ struct kvm_mp_state mp_state; - /* Don't run the guest (internal implementation need) */ - bool pause; - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; @@ -392,10 +411,6 @@ struct kvm_vcpu_arch { /* Additional reset state */ struct vcpu_reset_state reset_state; - /* True when deferrable sysregs are loaded on the physical CPU, - * see kvm_vcpu_load_sysregs_vhe and kvm_vcpu_put_sysregs_vhe. 
*/ - bool sysregs_loaded_on_cpu; - /* Guest PV state */ struct { u64 last_steal; @@ -403,6 +418,124 @@ struct kvm_vcpu_arch { } steal; }; +/* + * Each 'flag' is composed of a comma-separated triplet: + * + * - the flag-set it belongs to in the vcpu->arch structure + * - the value for that flag + * - the mask for that flag + * + * __vcpu_single_flag() builds such a triplet for a single-bit flag. + * unpack_vcpu_flag() extract the flag value from the triplet for + * direct use outside of the flag accessors. + */ +#define __vcpu_single_flag(_set, _f) _set, (_f), (_f) + +#define __unpack_flag(_set, _f, _m) _f +#define unpack_vcpu_flag(...) __unpack_flag(__VA_ARGS__) + +#define __build_check_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *_fset; \ + \ + /* Check that the flags fit in the mask */ \ + BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m))); \ + /* Check that the flags fit in the type */ \ + BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m)); \ + } while (0) + +#define __vcpu_get_flag(v, flagset, f, m) \ + ({ \ + __build_check_flag(v, flagset, f, m); \ + \ + v->arch.flagset & (m); \ + }) + +#define __vcpu_set_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *fset; \ + \ + __build_check_flag(v, flagset, f, m); \ + \ + fset = &v->arch.flagset; \ + if (HWEIGHT(m) > 1) \ + *fset &= ~(m); \ + *fset |= (f); \ + } while (0) + +#define __vcpu_clear_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *fset; \ + \ + __build_check_flag(v, flagset, f, m); \ + \ + fset = &v->arch.flagset; \ + *fset &= ~(m); \ + } while (0) + +#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__) +#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) +#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__) + +/* SVE exposed to guest */ +#define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) +/* SVE config completed */ +#define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) +/* PTRAUTH exposed to guest */ +#define GUEST_HAS_PTRAUTH __vcpu_single_flag(cflags, BIT(2)) + +/* Exception pending */ +#define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) +/* + * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't + * be set together with an exception... 
+ */ +#define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1)) +/* Target EL/MODE (not a single flag, but let's abuse the macro) */ +#define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1)) + +/* Helpers to encode exceptions with minimum fuss */ +#define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK) +#define __EXCEPT_SHIFT __builtin_ctzl(__EXCEPT_MASK_VAL) +#define __vcpu_except_flags(_f) iflags, (_f << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL + +/* + * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following + * values: + * + * For AArch32 EL1: + */ +#define EXCEPT_AA32_UND __vcpu_except_flags(0) +#define EXCEPT_AA32_IABT __vcpu_except_flags(1) +#define EXCEPT_AA32_DABT __vcpu_except_flags(2) +/* For AArch64: */ +#define EXCEPT_AA64_EL1_SYNC __vcpu_except_flags(0) +#define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1) +#define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2) +#define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3) +/* For AArch64 with NV (one day): */ +#define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4) +#define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) +#define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) +#define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) +/* Guest debug is live */ +#define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) +/* Save SPE context if active */ +#define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) +/* Save TRBE context if active */ +#define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) + +/* SVE enabled for host EL0 */ +#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) +/* SME enabled for EL0 */ +#define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) +/* Physical CPU not in supported_cpus */ +#define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) +/* WFIT instruction trapped */ +#define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) +/* vcpu system registers loaded on physical CPU */ +#define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) + /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl)) @@ -423,70 +556,31 @@ struct kvm_vcpu_arch { __size_ret; \ }) -/* vcpu_arch flags field values: */ -#define KVM_ARM64_DEBUG_DIRTY (1 << 0) -#define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ -#define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ -#define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ -#define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ -#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ -#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ -#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */ -/* - * Overlaps with KVM_ARM64_EXCEPT_MASK on purpose so that it can't be - * set together with an exception... 
- */ -#define KVM_ARM64_INCREMENT_PC (1 << 9) /* Increment PC */ -#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */ -/* - * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can - * take the following values: - * - * For AArch32 EL1: - */ -#define KVM_ARM64_EXCEPT_AA32_UND (0 << 9) -#define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9) -#define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9) -/* For AArch64: */ -#define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_IRQ (1 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_FIQ (2 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_SERR (3 << 9) -#define KVM_ARM64_EXCEPT_AA64_EL1 (0 << 11) -#define KVM_ARM64_EXCEPT_AA64_EL2 (1 << 11) - -#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */ -#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ -#define KVM_ARM64_FP_FOREIGN_FPSTATE (1 << 14) -#define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ -#define KVM_ARM64_HOST_SME_ENABLED (1 << 16) /* SME enabled for EL0 */ -#define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ - #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ KVM_GUESTDBG_SINGLESTEP) #define vcpu_has_sve(vcpu) (system_supports_sve() && \ - ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) + vcpu_get_flag(vcpu, GUEST_HAS_SVE)) #ifdef CONFIG_ARM64_PTR_AUTH #define vcpu_has_ptrauth(vcpu) \ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \ - (vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH) + vcpu_get_flag(vcpu, GUEST_HAS_PTRAUTH)) #else #define vcpu_has_ptrauth(vcpu) false #endif #define vcpu_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags & KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_set_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags |= KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_clear_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags &= ~KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_gp_regs(v) (&(v)->arch.ctxt.regs) @@ -620,8 +714,6 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); @@ -831,8 +923,7 @@ void kvm_init_protected_traps(struct kvm_vcpu *vcpu); int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); -#define kvm_arm_vcpu_sve_finalized(vcpu) \ - ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED) +#define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED) #define kvm_has_mte(kvm) \ (system_supports_mte() && \ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 227d256cd4b9..4e4c171f070b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -114,6 +114,14 @@ #define OVERFLOW_STACK_SIZE SZ_4K /* + * With the minimum frame size of [x29, x30], exactly half the combined + * sizes of the hyp and overflow stacks is the maximum size needed to + * save the unwinded stacktrace; plus an 
additional entry to delimit the + * end. + */ +#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long)) + +/* * Alignment of kernel segments (e.g. .text, .data). * * 4 KB granule: 16 level 3 entries, with contiguous bit diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index aec9315bf156..6ebdcdff77f5 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -8,52 +8,20 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> -#include <linux/types.h> #include <linux/llist.h> #include <asm/memory.h> +#include <asm/pointer_auth.h> #include <asm/ptrace.h> #include <asm/sdei.h> -enum stack_type { - STACK_TYPE_UNKNOWN, - STACK_TYPE_TASK, - STACK_TYPE_IRQ, - STACK_TYPE_OVERFLOW, - STACK_TYPE_SDEI_NORMAL, - STACK_TYPE_SDEI_CRITICAL, - __NR_STACK_TYPES -}; - -struct stack_info { - unsigned long low; - unsigned long high; - enum stack_type type; -}; +#include <asm/stacktrace/common.h> extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_stack(unsigned long sp, unsigned long size, - unsigned long low, unsigned long high, - enum stack_type type, struct stack_info *info) -{ - if (!low) - return false; - - if (sp < low || sp + size < sp || sp + size > high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = type; - } - return true; -} - static inline bool on_irq_stack(unsigned long sp, unsigned long size, struct stack_info *info) { @@ -89,30 +57,4 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { return false; } #endif - -/* - * We can only safely access per-cpu stacks from current in a non-preemptible - * context. - */ -static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, unsigned long size, - struct stack_info *info) -{ - if (info) - info->type = STACK_TYPE_UNKNOWN; - - if (on_task_stack(tsk, sp, size, info)) - return true; - if (tsk != current || preemptible()) - return false; - if (on_irq_stack(sp, size, info)) - return true; - if (on_overflow_stack(sp, size, info)) - return true; - if (on_sdei_stack(sp, size, info)) - return true; - - return false; -} - #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h new file mode 100644 index 000000000000..f58eb944c46f --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common arm64 stack unwinder code. + * + * To implement a new arm64 stack unwinder: + * 1) Include this header + * + * 2) Call into unwind_next_common() from your top level unwind + * function, passing it the validation and translation callbacks + * (though the later can be NULL if no translation is required). + * + * See: arch/arm64/kernel/stacktrace.c for the reference implementation. + * + * Copyright (C) 2012 ARM Ltd. 
+ */ +#ifndef __ASM_STACKTRACE_COMMON_H +#define __ASM_STACKTRACE_COMMON_H + +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/kprobes.h> +#include <linux/types.h> + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_OVERFLOW, + STACK_TYPE_SDEI_NORMAL, + STACK_TYPE_SDEI_CRITICAL, + STACK_TYPE_HYP, + __NR_STACK_TYPES +}; + +struct stack_info { + unsigned long low; + unsigned long high; + enum stack_type type; +}; + +/* + * A snapshot of a frame record or fp/lr register values, along with some + * accounting information necessary for robust unwinding. + * + * @fp: The fp value in the frame record (or the real fp) + * @pc: The lr value in the frame record (or the real lr) + * + * @stacks_done: Stacks which have been entirely unwound, for which it is no + * longer valid to unwind to. + * + * @prev_fp: The fp that pointed to this frame record, or a synthetic value + * of 0. This is used to ensure that within a stack, each + * subsequent frame record is at an increasing address. + * @prev_type: The type of stack this frame record was on, or a synthetic + * value of STACK_TYPE_UNKNOWN. This is used to detect a + * transition from one stack to another. + * + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. + * + * @task: The task being unwound. + */ +struct unwind_state { + unsigned long fp; + unsigned long pc; + DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); + unsigned long prev_fp; + enum stack_type prev_type; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif + struct task_struct *task; +}; + +static inline bool on_stack(unsigned long sp, unsigned long size, + unsigned long low, unsigned long high, + enum stack_type type, struct stack_info *info) +{ + if (!low) + return false; + + if (sp < low || sp + size < sp || sp + size > high) + return false; + + if (info) { + info->low = low; + info->high = high; + info->type = type; + } + return true; +} + +static inline void unwind_init_common(struct unwind_state *state, + struct task_struct *task) +{ + state->task = task; +#ifdef CONFIG_KRETPROBES + state->kr_cur = NULL; +#endif + + /* + * Prime the first unwind. + * + * In unwind_next() we'll check that the FP points to a valid stack, + * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be + * treated as a transition to whichever stack that happens to be. The + * prev_fp value won't be used, but we set it to 0 such that it is + * definitely not an accessible stack address. + */ + bitmap_zero(state->stacks_done, __NR_STACK_TYPES); + state->prev_fp = 0; + state->prev_type = STACK_TYPE_UNKNOWN; +} + +/* + * stack_trace_translate_fp_fn() - Translates a non-kernel frame pointer to + * a kernel address. + * + * @fp: the frame pointer to be updated to its kernel address. + * @type: the stack type associated with frame pointer @fp + * + * Returns true and success and @fp is updated to the corresponding + * kernel virtual address; otherwise returns false. + */ +typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp, + enum stack_type type); + +/* + * on_accessible_stack_fn() - Check whether a stack range is on any + * of the possible stacks. 
+ * + * @tsk: task whose stack is being unwound + * @sp: stack address being checked + * @size: size of the stack range being checked + * @info: stack unwinding context + */ +typedef bool (*on_accessible_stack_fn)(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info); + +static inline int unwind_next_common(struct unwind_state *state, + struct stack_info *info, + on_accessible_stack_fn accessible, + stack_trace_translate_fp_fn translate_fp) +{ + unsigned long fp = state->fp, kern_fp = fp; + struct task_struct *tsk = state->task; + + if (fp & 0x7) + return -EINVAL; + + if (!accessible(tsk, fp, 16, info)) + return -EINVAL; + + if (test_bit(info->type, state->stacks_done)) + return -EINVAL; + + /* + * If fp is not from the current address space perform the necessary + * translation before dereferencing it to get the next fp. + */ + if (translate_fp && !translate_fp(&kern_fp, info->type)) + return -EINVAL; + + /* + * As stacks grow downward, any valid record on the same stack must be + * at a strictly higher address than the prior record. + * + * Stacks can nest in several valid orders, e.g. + * + * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL + * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW + * HYP -> OVERFLOW + * + * ... but the nesting itself is strict. Once we transition from one + * stack to another, it's never valid to unwind back to that first + * stack. + */ + if (info->type == state->prev_type) { + if (fp <= state->prev_fp) + return -EINVAL; + } else { + __set_bit(state->prev_type, state->stacks_done); + } + + /* + * Record this frame record's values and location. The prev_fp and + * prev_type are only meaningful to the next unwind_next() invocation. + */ + state->fp = READ_ONCE(*(unsigned long *)(kern_fp)); + state->pc = READ_ONCE(*(unsigned long *)(kern_fp + 8)); + state->prev_fp = fp; + state->prev_type = info->type; + + return 0; +} + +#endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h new file mode 100644 index 000000000000..d5527b600390 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * + * Copyright (C) 2022 Google LLC + */ +#ifndef __ASM_STACKTRACE_NVHE_H +#define __ASM_STACKTRACE_NVHE_H + +#include <asm/stacktrace/common.h> + +/* + * kvm_nvhe_unwind_init - Start an unwind from the given nVHE HYP fp and pc + * + * @state : unwind_state to initialize + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. 
+ */ +static inline void kvm_nvhe_unwind_init(struct unwind_state *state, + unsigned long fp, + unsigned long pc) +{ + unwind_init_common(state, NULL); + + state->fp = fp; + state->pc = pc; +} + +#ifndef __KVM_NVHE_HYPERVISOR__ +/* + * Conventional (non-protected) nVHE HYP stack unwinder + * + * In non-protected mode, the unwinding is done from kernel proper context + * (by the host in EL1). + */ + +DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); +DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); + +#endif /* __KVM_NVHE_HYPERVISOR__ */ +#endif /* __ASM_STACKTRACE_NVHE_H */ |
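
To make the unwind_next_common() contract from stacktrace/common.h
above concrete, a minimal consumer could look like the sketch below;
my_stack_low/my_stack_high, the choice of STACK_TYPE_HYP and the empty
loop body are illustrative only, and no fp translation callback is
passed (the header allows NULL when no translation is required):

    static unsigned long my_stack_low, my_stack_high;    /* illustrative stack bounds */

    static bool my_on_accessible_stack(const struct task_struct *tsk,
                                       unsigned long sp, unsigned long size,
                                       struct stack_info *info)
    {
            return on_stack(sp, size, my_stack_low, my_stack_high,
                            STACK_TYPE_HYP, info);
    }

    static void my_dump_backtrace(unsigned long fp, unsigned long pc)
    {
            struct unwind_state state;
            struct stack_info info;

            kvm_nvhe_unwind_init(&state, fp, pc);

            do {
                    /* consume the current frame here, e.g. print state.pc */
            } while (!unwind_next_common(&state, &info,
                                         my_on_accessible_stack, NULL));
    }
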