aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds2019-11-25 18:02:36 -0800
committerLinus Torvalds2019-11-25 18:02:36 -0800
commit752272f16dd18f2cac58a583a8673c8e2fb93abb (patch)
tree1a2bae3067e1133c1d1b8e0bbbaea8c34e3321b2
parent3f3c8be973af10875cfa1e7b85a535b6ba76b44f (diff)
parent96710247298df52a4b8150a62a6fe87083093ff3 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "ARM: - data abort report and injection - steal time support - GICv4 performance improvements - vgic ITS emulation fixes - simplify FWB handling - enable halt polling counters - make the emulated timer PREEMPT_RT compliant s390: - small fixes and cleanups - selftest improvements - yield improvements PPC: - add capability to tell userspace whether we can single-step the guest - improve the allocation of XIVE virtual processor IDs - rewrite interrupt synthesis code to deliver interrupts in virtual mode when appropriate. - minor cleanups and improvements. x86: - XSAVES support for AMD - more accurate report of nested guest TSC to the nested hypervisor - retpoline optimizations - support for nested 5-level page tables - PMU virtualization optimizations, and improved support for nested PMU virtualization - correct latching of INITs for nested virtualization - IOAPIC optimization - TSX_CTRL virtualization for more TAA happiness - improved allocation and flushing of SEV ASIDs - many bugfixes and cleanups" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (127 commits) kvm: nVMX: Relax guest IA32_FEATURE_CONTROL constraints KVM: x86: Grab KVM's srcu lock when setting nested state KVM: x86: Open code shared_msr_update() in its only caller KVM: Fix jump label out_free_* in kvm_init() KVM: x86: Remove a spurious export of a static function KVM: x86: create mmu/ subdirectory KVM: nVMX: Remove unnecessary TLB flushes on L1<->L2 switches when L1 use apic-access-page KVM: x86: remove set but not used variable 'called' KVM: nVMX: Do not mark vmcs02->apic_access_page as dirty when unpinning KVM: vmx: use MSR_IA32_TSX_CTRL to hard-disable TSX on guest that lack it KVM: vmx: implement MSR_IA32_TSX_CTRL disable RTM functionality KVM: x86: implement MSR_IA32_TSX_CTRL effect on CPUID KVM: x86: do not modify masked bits of shared MSRs KVM: x86: fix presentation of TSX feature in ARCH_CAPABILITIES KVM: PPC: Book3S HV: XIVE: Fix potential page leak on error path KVM: PPC: Book3S HV: XIVE: Free previous EQ page when setting up a new one KVM: nVMX: Assume TLB entries of L1 and L2 are tagged differently if L0 use EPT KVM: x86: Unexport kvm_vcpu_reload_apic_access_page() KVM: nVMX: add CR4_LA57 bit to nested CR4_FIXED1 KVM: nVMX: Use semi-colon instead of comma for exit-handlers initialization ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt6
-rw-r--r--Documentation/virt/kvm/api.txt58
-rw-r--r--Documentation/virt/kvm/arm/pvtime.rst80
-rw-r--r--Documentation/virt/kvm/devices/vcpu.txt14
-rw-r--r--Documentation/virt/kvm/devices/xics.txt14
-rw-r--r--Documentation/virt/kvm/devices/xive.txt8
-rw-r--r--arch/arm/include/asm/kvm_arm.h1
-rw-r--r--arch/arm/include/asm/kvm_emulate.h9
-rw-r--r--arch/arm/include/asm/kvm_host.h33
-rw-r--r--arch/arm/include/uapi/asm/kvm.h3
-rw-r--r--arch/arm/kvm/Makefile2
-rw-r--r--arch/arm/kvm/guest.c14
-rw-r--r--arch/arm/kvm/handle_exit.c2
-rw-r--r--arch/arm/mm/proc-v7-bugs.c13
-rw-r--r--arch/arm64/include/asm/kvm_arm.h3
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h26
-rw-r--r--arch/arm64/include/asm/kvm_host.h37
-rw-r--r--arch/arm64/include/asm/paravirt.h9
-rw-r--r--arch/arm64/include/asm/pvclock-abi.h17
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h5
-rw-r--r--arch/arm64/kernel/cpu_errata.c81
-rw-r--r--arch/arm64/kernel/paravirt.c140
-rw-r--r--arch/arm64/kernel/time.c3
-rw-r--r--arch/arm64/kvm/Kconfig4
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/guest.c23
-rw-r--r--arch/arm64/kvm/handle_exit.c4
-rw-r--r--arch/arm64/kvm/inject_fault.c4
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h1
-rw-r--r--arch/powerpc/include/asm/reg.h12
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h3
-rw-r--r--arch/powerpc/kvm/book3s.c27
-rw-r--r--arch/powerpc/kvm/book3s.h3
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu.c6
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c15
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c26
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c28
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c82
-rw-r--r--arch/powerpc/kvm/book3s_hv_nested.c2
-rw-r--r--arch/powerpc/kvm/book3s_pr.c40
-rw-r--r--arch/powerpc/kvm/book3s_xive.c128
-rw-r--r--arch/powerpc/kvm/book3s_xive.h5
-rw-r--r--arch/powerpc/kvm/book3s_xive_native.c82
-rw-r--r--arch/powerpc/kvm/e500_mmu_host.c6
-rw-r--r--arch/powerpc/kvm/powerpc.c2
-rw-r--r--arch/s390/include/asm/kvm_host.h1
-rw-r--r--arch/s390/kvm/diag.c22
-rw-r--r--arch/s390/kvm/interrupt.c5
-rw-r--r--arch/s390/kvm/kvm-s390.c19
-rw-r--r--arch/x86/events/intel/core.c11
-rw-r--r--arch/x86/include/asm/kvm_host.h32
-rw-r--r--arch/x86/kernel/kvm.c1
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/cpuid.c8
-rw-r--r--arch/x86/kvm/emulate.c6
-rw-r--r--arch/x86/kvm/ioapic.c34
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h51
-rw-r--r--arch/x86/kvm/lapic.c111
-rw-r--r--arch/x86/kvm/lapic.h3
-rw-r--r--arch/x86/kvm/mmu/mmu.c (renamed from arch/x86/kvm/mmu.c)2
-rw-r--r--arch/x86/kvm/mmu/page_track.c (renamed from arch/x86/kvm/page_track.c)0
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h (renamed from arch/x86/kvm/paging_tmpl.h)0
-rw-r--r--arch/x86/kvm/pmu.c124
-rw-r--r--arch/x86/kvm/pmu.h29
-rw-r--r--arch/x86/kvm/pmu_amd.c24
-rw-r--r--arch/x86/kvm/svm.c140
-rw-r--r--arch/x86/kvm/vmx/nested.c252
-rw-r--r--arch/x86/kvm/vmx/nested.h9
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c34
-rw-r--r--arch/x86/kvm/vmx/vmx.c339
-rw-r--r--arch/x86/kvm/vmx/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c244
-rw-r--r--arch/x86/kvm/x86.h15
-rw-r--r--drivers/crypto/ccp/psp-dev.c9
-rw-r--r--drivers/irqchip/irq-gic-v4.c7
-rw-r--r--include/Kbuild2
-rw-r--r--include/kvm/arm_hypercalls.h43
-rw-r--r--include/kvm/arm_psci.h2
-rw-r--r--include/kvm/arm_vgic.h8
-rw-r--r--include/linux/arm-smccc.h59
-rw-r--r--include/linux/cpuhotplug.h1
-rw-r--r--include/linux/irqchip/arm-gic-v4.h4
-rw-r--r--include/linux/kvm_host.h41
-rw-r--r--include/linux/kvm_types.h2
-rw-r--r--include/linux/perf_event.h10
-rw-r--r--include/uapi/linux/kvm.h11
-rw-r--r--kernel/events/core.c46
-rw-r--r--tools/testing/selftests/kvm/.gitignore1
-rw-r--r--tools/testing/selftests/kvm/Makefile1
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h7
-rw-r--r--tools/testing/selftests/kvm/kvm_create_max_vcpus.c7
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c72
-rw-r--r--tools/testing/selftests/kvm/s390x/sync_regs_test.c15
-rw-r--r--tools/testing/selftests/kvm/x86_64/xss_msr_test.c76
-rw-r--r--virt/kvm/arm/arch_timer.c8
-rw-r--r--virt/kvm/arm/arm.c49
-rw-r--r--virt/kvm/arm/hypercalls.c71
-rw-r--r--virt/kvm/arm/mmio.c9
-rw-r--r--virt/kvm/arm/psci.c84
-rw-r--r--virt/kvm/arm/pvtime.c131
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c1
-rw-r--r--virt/kvm/arm/vgic/vgic-its.c3
-rw-r--r--virt/kvm/arm/vgic/vgic-v3.c12
-rw-r--r--virt/kvm/arm/vgic/vgic-v4.c59
-rw-r--r--virt/kvm/arm/vgic/vgic.c4
-rw-r--r--virt/kvm/arm/vgic/vgic.h2
-rw-r--r--virt/kvm/coalesced_mmio.c8
-rw-r--r--virt/kvm/kvm_main.c34
110 files changed, 2593 insertions, 924 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8dee8f68fe15..9b847f06cb19 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3110,9 +3110,9 @@
[X86,PV_OPS] Disable paravirtualized VMware scheduler
clock and use the default one.
- no-steal-acc [X86,KVM] Disable paravirtualized steal time accounting.
- steal time is computed, but won't influence scheduler
- behaviour
+ no-steal-acc [X86,KVM,ARM64] Disable paravirtualized steal time
+ accounting. steal time is computed, but won't
+ influence scheduler behaviour
nolapic [X86-32,APIC] Do not enable or use the local APIC.
diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt
index 4833904d32a5..49183add44e7 100644
--- a/Documentation/virt/kvm/api.txt
+++ b/Documentation/virt/kvm/api.txt
@@ -1002,12 +1002,18 @@ Specifying exception.has_esr on a system that does not support it will return
-EINVAL. Setting anything other than the lower 24bits of exception.serror_esr
will return -EINVAL.
+It is not possible to read back a pending external abort (injected via
+KVM_SET_VCPU_EVENTS or otherwise) because such an exception is always delivered
+directly to the virtual CPU).
+
+
struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+ __u8 ext_dabt_pending;
/* Align it to 8 bytes */
- __u8 pad[6];
+ __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
@@ -1051,9 +1057,23 @@ contain a valid state and shall be written into the VCPU.
ARM/ARM64:
+User space may need to inject several types of events to the guest.
+
Set the pending SError exception state for this VCPU. It is not possible to
'cancel' an Serror that has been made pending.
+If the guest performed an access to I/O memory which could not be handled by
+userspace, for example because of missing instruction syndrome decode
+information or because there is no device mapped at the accessed IPA, then
+userspace can ask the kernel to inject an external abort using the address
+from the exiting fault on the VCPU. It is a programming error to set
+ext_dabt_pending after an exit which was not either KVM_EXIT_MMIO or
+KVM_EXIT_ARM_NISV. This feature is only available if the system supports
+KVM_CAP_ARM_INJECT_EXT_DABT. This is a helper which provides commonality in
+how userspace reports accesses for the above cases to guests, across different
+userspace implementations. Nevertheless, userspace can still emulate all Arm
+exceptions by manipulating individual registers using the KVM_SET_ONE_REG API.
+
See KVM_GET_VCPU_EVENTS for the data structure.
@@ -2982,6 +3002,9 @@ can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and
KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number
indicating the number of supported registers.
+For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether
+the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported.
+
When debug events exit the main run loop with the reason
KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
structure containing architecture specific debug information.
@@ -4468,6 +4491,39 @@ Hyper-V SynIC state change. Notification is used to remap SynIC
event/message pages and to enable/disable SynIC messages/events processing
in userspace.
+ /* KVM_EXIT_ARM_NISV */
+ struct {
+ __u64 esr_iss;
+ __u64 fault_ipa;
+ } arm_nisv;
+
+Used on arm and arm64 systems. If a guest accesses memory not in a memslot,
+KVM will typically return to userspace and ask it to do MMIO emulation on its
+behalf. However, for certain classes of instructions, no instruction decode
+(direction, length of memory access) is provided, and fetching and decoding
+the instruction from the VM is overly complicated to live in the kernel.
+
+Historically, when this situation occurred, KVM would print a warning and kill
+the VM. KVM assumed that if the guest accessed non-memslot memory, it was
+trying to do I/O, which just couldn't be emulated, and the warning message was
+phrased accordingly. However, what happened more often was that a guest bug
+caused access outside the guest memory areas which should lead to a more
+meaningful warning message and an external abort in the guest, if the access
+did not fall within an I/O window.
+
+Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable
+this capability at VM creation. Once this is done, these types of errors will
+instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from
+the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA
+in the fault_ipa field. Userspace can either fix up the access if it's
+actually an I/O access by decoding the instruction from guest memory (if it's
+very brave) and continue executing the guest, or it can decide to suspend,
+dump, or restart the guest.
+
+Note that KVM does not skip the faulting instruction as it does for
+KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state
+if it decides to decode and emulate the instruction.
+
/* Fix the size of the union. */
char padding[256];
};
diff --git a/Documentation/virt/kvm/arm/pvtime.rst b/Documentation/virt/kvm/arm/pvtime.rst
new file mode 100644
index 000000000000..2357dd2d8655
--- /dev/null
+++ b/Documentation/virt/kvm/arm/pvtime.rst
@@ -0,0 +1,80 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Paravirtualized time support for arm64
+======================================
+
+Arm specification DEN0057/A defines a standard for paravirtualised time
+support for AArch64 guests:
+
+https://developer.arm.com/docs/den0057/a
+
+KVM/arm64 implements the stolen time part of this specification by providing
+some hypervisor service calls to support a paravirtualized guest obtaining a
+view of the amount of time stolen from its execution.
+
+Two new SMCCC compatible hypercalls are defined:
+
+* PV_TIME_FEATURES: 0xC5000020
+* PV_TIME_ST: 0xC5000021
+
+These are only available in the SMC64/HVC64 calling convention as
+paravirtualized time is not available to 32 bit Arm guests. The existence of
+the PV_FEATURES hypercall should be probed using the SMCCC 1.1 ARCH_FEATURES
+mechanism before calling it.
+
+PV_TIME_FEATURES
+ ============= ======== ==========
+ Function ID: (uint32) 0xC5000020
+ PV_call_id: (uint32) The function to query for support.
+ Currently only PV_TIME_ST is supported.
+ Return value: (int64) NOT_SUPPORTED (-1) or SUCCESS (0) if the relevant
+ PV-time feature is supported by the hypervisor.
+ ============= ======== ==========
+
+PV_TIME_ST
+ ============= ======== ==========
+ Function ID: (uint32) 0xC5000021
+ Return value: (int64) IPA of the stolen time data structure for this
+ VCPU. On failure:
+ NOT_SUPPORTED (-1)
+ ============= ======== ==========
+
+The IPA returned by PV_TIME_ST should be mapped by the guest as normal memory
+with inner and outer write back caching attributes, in the inner shareable
+domain. A total of 16 bytes from the IPA returned are guaranteed to be
+meaningfully filled by the hypervisor (see structure below).
+
+PV_TIME_ST returns the structure for the calling VCPU.
+
+Stolen Time
+-----------
+
+The structure pointed to by the PV_TIME_ST hypercall is as follows:
+
++-------------+-------------+-------------+----------------------------+
+| Field | Byte Length | Byte Offset | Description |
++=============+=============+=============+============================+
+| Revision | 4 | 0 | Must be 0 for version 1.0 |
++-------------+-------------+-------------+----------------------------+
+| Attributes | 4 | 4 | Must be 0 |
++-------------+-------------+-------------+----------------------------+
+| Stolen time | 8 | 8 | Stolen time in unsigned |
+| | | | nanoseconds indicating how |
+| | | | much time this VCPU thread |
+| | | | was involuntarily not |
+| | | | running on a physical CPU. |
++-------------+-------------+-------------+----------------------------+
+
+All values in the structure are stored little-endian.
+
+The structure will be updated by the hypervisor prior to scheduling a VCPU. It
+will be present within a reserved region of the normal memory given to the
+guest. The guest should not attempt to write into this memory. There is a
+structure per VCPU of the guest.
+
+It is advisable that one or more 64k pages are set aside for the purpose of
+these structures and not used for other purposes, this enables the guest to map
+the region using 64k pages and avoids conflicting attributes with other memory.
+
+For the user space interface see Documentation/virt/kvm/devices/vcpu.txt
+section "3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL".
diff --git a/Documentation/virt/kvm/devices/vcpu.txt b/Documentation/virt/kvm/devices/vcpu.txt
index 2b5dab16c4f2..6f3bd64a05b0 100644
--- a/Documentation/virt/kvm/devices/vcpu.txt
+++ b/Documentation/virt/kvm/devices/vcpu.txt
@@ -60,3 +60,17 @@ time to use the number provided for a given timer, overwriting any previously
configured values on other VCPUs. Userspace should configure the interrupt
numbers on at least one VCPU after creating all VCPUs and before running any
VCPUs.
+
+3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL
+Architectures: ARM64
+
+3.1 ATTRIBUTE: KVM_ARM_VCPU_PVTIME_IPA
+Parameters: 64-bit base address
+Returns: -ENXIO: Stolen time not implemented
+ -EEXIST: Base address already set for this VCPU
+ -EINVAL: Base address not 64 byte aligned
+
+Specifies the base address of the stolen time structure for this VCPU. The
+base address must be 64 byte aligned and exist within a valid guest memory
+region. See Documentation/virt/kvm/arm/pvtime.txt for more information
+including the layout of the stolen time structure.
diff --git a/Documentation/virt/kvm/devices/xics.txt b/Documentation/virt/kvm/devices/xics.txt
index 42864935ac5d..423332dda7bc 100644
--- a/Documentation/virt/kvm/devices/xics.txt
+++ b/Documentation/virt/kvm/devices/xics.txt
@@ -3,9 +3,19 @@ XICS interrupt controller
Device type supported: KVM_DEV_TYPE_XICS
Groups:
- KVM_DEV_XICS_SOURCES
+ 1. KVM_DEV_XICS_GRP_SOURCES
Attributes: One per interrupt source, indexed by the source number.
+ 2. KVM_DEV_XICS_GRP_CTRL
+ Attributes:
+ 2.1 KVM_DEV_XICS_NR_SERVERS (write only)
+ The kvm_device_attr.addr points to a __u32 value which is the number of
+ interrupt server numbers (ie, highest possible vcpu id plus one).
+ Errors:
+ -EINVAL: Value greater than KVM_MAX_VCPU_ID.
+ -EFAULT: Invalid user pointer for attr->addr.
+ -EBUSY: A vcpu is already connected to the device.
+
This device emulates the XICS (eXternal Interrupt Controller
Specification) defined in PAPR. The XICS has a set of interrupt
sources, each identified by a 20-bit source number, and a set of
@@ -38,7 +48,7 @@ least-significant end of the word:
Each source has 64 bits of state that can be read and written using
the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the
-KVM_DEV_XICS_SOURCES attribute group, with the attribute number being
+KVM_DEV_XICS_GRP_SOURCES attribute group, with the attribute number being
the interrupt source number. The 64 bit state word has the following
bitfields, starting from the least-significant end of the word:
diff --git a/Documentation/virt/kvm/devices/xive.txt b/Documentation/virt/kvm/devices/xive.txt
index 9a24a4525253..f5d1d6b5af61 100644
--- a/Documentation/virt/kvm/devices/xive.txt
+++ b/Documentation/virt/kvm/devices/xive.txt
@@ -78,6 +78,14 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
migrating the VM.
Errors: none
+ 1.3 KVM_DEV_XIVE_NR_SERVERS (write only)
+ The kvm_device_attr.addr points to a __u32 value which is the number of
+ interrupt server numbers (ie, highest possible vcpu id plus one).
+ Errors:
+ -EINVAL: Value greater than KVM_MAX_VCPU_ID.
+ -EFAULT: Invalid user pointer for attr->addr.
+ -EBUSY: A vCPU is already connected to the device.
+
2. KVM_DEV_XIVE_GRP_SOURCE (write only)
Initializes a new source in the XIVE device and mask it.
Attributes:
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 0125aa059d5b..9c04bd810d07 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -162,6 +162,7 @@
#define HSR_ISV (_AC(1, UL) << HSR_ISV_SHIFT)
#define HSR_SRT_SHIFT (16)
#define HSR_SRT_MASK (0xf << HSR_SRT_SHIFT)
+#define HSR_CM (1 << 8)
#define HSR_FSC (0x3f)
#define HSR_FSC_TYPE (0x3c)
#define HSR_SSE (1 << 21)
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 40002416efec..9b118516d2db 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -95,12 +95,12 @@ static inline unsigned long *vcpu_hcr(const struct kvm_vcpu *vcpu)
return (unsigned long *)&vcpu->arch.hcr;
}
-static inline void vcpu_clear_wfe_traps(struct kvm_vcpu *vcpu)
+static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr &= ~HCR_TWE;
}
-static inline void vcpu_set_wfe_traps(struct kvm_vcpu *vcpu)
+static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr |= HCR_TWE;
}
@@ -167,6 +167,11 @@ static inline bool kvm_vcpu_dabt_isvalid(struct kvm_vcpu *vcpu)
return kvm_vcpu_get_hsr(vcpu) & HSR_ISV;
}
+static inline unsigned long kvm_vcpu_dabt_iss_nisv_sanitized(const struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_get_hsr(vcpu) & (HSR_CM | HSR_WNR | HSR_FSC);
+}
+
static inline bool kvm_vcpu_dabt_iswrite(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_get_hsr(vcpu) & HSR_WNR;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 8a37c8e89777..556cd818eccf 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -7,6 +7,7 @@
#ifndef __ARM_KVM_HOST_H__
#define __ARM_KVM_HOST_H__
+#include <linux/arm-smccc.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/kvm_types.h>
@@ -38,6 +39,7 @@
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1)
#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
+#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
@@ -76,6 +78,14 @@ struct kvm_arch {
/* Mandated version of PSCI */
u32 psci_version;
+
+ /*
+ * If we encounter a data abort without valid instruction syndrome
+ * information, report this to user space. User space can (and
+ * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
+ * supported.
+ */
+ bool return_nisv_io_abort_to_user;
};
#define KVM_NR_MEM_OBJS 40
@@ -323,6 +333,29 @@ static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
int kvm_perf_init(void);
int kvm_perf_teardown(void);
+static inline long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
+{
+ return SMCCC_RET_NOT_SUPPORTED;
+}
+
+static inline gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
+{
+ return GPA_INVALID;
+}
+
+static inline void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
+{
+}
+
+static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
+{
+ return false;
+}
+
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 2769360f195c..03cd7c19a683 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -131,8 +131,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+ __u8 ext_dabt_pending;
/* Align it to 8 bytes */
- __u8 pad[6];
+ __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index b76b75bd9e00..e442d82821df 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -24,7 +24,7 @@ obj-y += kvm-arm.o init.o interrupts.o
obj-y += handle_exit.o guest.o emulate.o reset.o
obj-y += coproc.o coproc_a15.o coproc_a7.o vgic-v3-coproc.o
obj-y += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
-obj-y += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
+obj-y += $(KVM)/arm/psci.o $(KVM)/arm/perf.o $(KVM)/arm/hypercalls.o
obj-y += $(KVM)/arm/aarch32.o
obj-y += $(KVM)/arm/vgic/vgic.o
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 684cf64b4033..0e6f23504c26 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -21,6 +21,10 @@
#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU }
struct kvm_stats_debugfs_item debugfs_entries[] = {
+ VCPU_STAT(halt_successful_poll),
+ VCPU_STAT(halt_attempted_poll),
+ VCPU_STAT(halt_poll_invalid),
+ VCPU_STAT(halt_wakeup),
VCPU_STAT(hvc_exit_stat),
VCPU_STAT(wfe_exit_stat),
VCPU_STAT(wfi_exit_stat),
@@ -255,6 +259,12 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
{
events->exception.serror_pending = !!(*vcpu_hcr(vcpu) & HCR_VA);
+ /*
+ * We never return a pending ext_dabt here because we deliver it to
+ * the virtual CPU directly when setting the event and it's no longer
+ * 'pending' at this point.
+ */
+
return 0;
}
@@ -263,12 +273,16 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
{
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
+ bool ext_dabt_pending = events->exception.ext_dabt_pending;
if (serror_pending && has_esr)
return -EINVAL;
else if (serror_pending)
kvm_inject_vabt(vcpu);
+ if (ext_dabt_pending)
+ kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+
return 0;
}
diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c
index 2a6a1394d26e..e58a89d2f13f 100644
--- a/arch/arm/kvm/handle_exit.c
+++ b/arch/arm/kvm/handle_exit.c
@@ -9,7 +9,7 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_coproc.h>
#include <asm/kvm_mmu.h>
-#include <kvm/arm_psci.h>
+#include <kvm/arm_hypercalls.h>
#include <trace/events/kvm.h>
#include "trace.h"
diff --git a/arch/arm/mm/proc-v7-bugs.c b/arch/arm/mm/proc-v7-bugs.c
index 54d87506d3b5..7c90b4c615a5 100644
--- a/arch/arm/mm/proc-v7-bugs.c
+++ b/arch/arm/mm/proc-v7-bugs.c
@@ -74,12 +74,13 @@ static void cpu_v7_spectre_init(void)
case ARM_CPU_PART_CORTEX_A72: {
struct arm_smccc_res res;
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+ if ((int)res.a0 != 0)
+ return;
+
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
- if ((int)res.a0 != 0)
- break;
per_cpu(harden_branch_predictor_fn, cpu) =
call_hvc_arch_workaround_1;
cpu_do_switch_mm = cpu_v7_hvc_switch_mm;
@@ -87,10 +88,6 @@ static void cpu_v7_spectre_init(void)
break;
case SMCCC_CONDUIT_SMC:
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
- if ((int)res.a0 != 0)
- break;
per_cpu(harden_branch_predictor_fn, cpu) =
call_smc_arch_workaround_1;
cpu_do_switch_mm = cpu_v7_smc_switch_mm;
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index ddf9d762ac62..6e5d839f42b5 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -61,7 +61,6 @@
* RW: 64bit by default, can be overridden for 32bit VMs
* TAC: Trap ACTLR
* TSC: Trap SMC
- * TVM: Trap VM ops (until M+C set in SCTLR_EL1)
* TSW: Trap cache operations by set/way
* TWE: Trap WFE
* TWI: Trap WFI
@@ -74,7 +73,7 @@
* SWIO: Turn set/way invalidates into set/way clean+invalidate
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
- HCR_TVM | HCR_BSU_IS | HCR_FB | HCR_TAC | \
+ HCR_BSU_IS | HCR_FB | HCR_TAC | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
HCR_FMO | HCR_IMO)
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index d69c1efc63e7..5efe5ca8fecf 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -53,8 +53,18 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
/* trap error record accesses */
vcpu->arch.hcr_el2 |= HCR_TERR;
}
- if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
vcpu->arch.hcr_el2 |= HCR_FWB;
+ } else {
+ /*
+ * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
+ * get set in SCTLR_EL1 such that we can detect when the guest
+ * MMU gets turned on and do the necessary cache maintenance
+ * then.
+ */
+ vcpu->arch.hcr_el2 |= HCR_TVM;
+ }
if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
vcpu->arch.hcr_el2 &= ~HCR_RW;
@@ -77,14 +87,19 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
return (unsigned long *)&vcpu->arch.hcr_el2;
}
-static inline void vcpu_clear_wfe_traps(struct kvm_vcpu *vcpu)
+static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr_el2 &= ~HCR_TWE;
+ if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count))
+ vcpu->arch.hcr_el2 &= ~HCR_TWI;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TWI;
}
-static inline void vcpu_set_wfe_traps(struct kvm_vcpu *vcpu)
+static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr_el2 |= HCR_TWE;
+ vcpu->arch.hcr_el2 |= HCR_TWI;
}
static inline void vcpu_ptrauth_enable(struct kvm_vcpu *vcpu)
@@ -258,6 +273,11 @@ static inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu)
return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_ISV);
}
+static inline unsigned long kvm_vcpu_dabt_iss_nisv_sanitized(const struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_get_hsr(vcpu) & (ESR_ELx_CM | ESR_ELx_WNR | ESR_ELx_FSC);
+}
+
static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu)
{
return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SSE);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 5ecb091c8576..c61260cf63c5 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -44,6 +44,7 @@
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1)
#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
+#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
@@ -83,6 +84,14 @@ struct kvm_arch {
/* Mandated version of PSCI */
u32 psci_version;
+
+ /*
+ * If we encounter a data abort without valid instruction syndrome
+ * information, report this to user space. User space can (and
+ * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
+ * supported.
+ */
+ bool return_nisv_io_abort_to_user;
};
#define KVM_NR_MEM_OBJS 40
@@ -338,6 +347,13 @@ struct kvm_vcpu_arch {
/* True when deferrable sysregs are loaded on the physical CPU,
* see kvm_vcpu_load_sysregs and kvm_vcpu_put_sysregs. */
bool sysregs_loaded_on_cpu;
+
+ /* Guest PV state */
+ struct {
+ u64 steal;
+ u64 last_steal;
+ gpa_t base;
+ } steal;
};
/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
@@ -478,6 +494,27 @@ void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
int kvm_perf_init(void);
int kvm_perf_teardown(void);
+long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
+gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu);
+void kvm_update_stolen_time(struct kvm_vcpu *vcpu);
+
+int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr);
+int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr);
+int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr);
+
+static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
+{
+ vcpu_arch->steal.base = GPA_INVALID;
+}
+
+static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
+{
+ return (vcpu_arch->steal.base != GPA_INVALID);
+}
+
void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
index 799d9dd6f7cc..cf3a0fd7c1a7 100644
--- a/arch/arm64/include/asm/paravirt.h
+++ b/arch/arm64/include/asm/paravirt.h
@@ -21,6 +21,13 @@ static inline u64 paravirt_steal_clock(int cpu)
{
return pv_ops.time.steal_clock(cpu);
}
-#endif
+
+int __init pv_time_init(void);
+
+#else
+
+#define pv_time_init() do {} while (0)
+
+#endif // CONFIG_PARAVIRT
#endif
diff --git a/arch/arm64/include/asm/pvclock-abi.h b/arch/arm64/include/asm/pvclock-abi.h
new file mode 100644
index 000000000000..c4f1c0a0789c
--- /dev/null
+++ b/arch/arm64/include/asm/pvclock-abi.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019 Arm Ltd. */
+
+#ifndef __ASM_PVCLOCK_ABI_H
+#define __ASM_PVCLOCK_ABI_H
+
+/* The below structure is defined in ARM DEN0057A */
+
+struct pvclock_vcpu_stolen_time {
+ __le32 revision;
+ __le32 attributes;
+ __le64 stolen_time;
+ /* Structure must be 64 byte aligned, pad to that size */
+ u8 padding[48];
+} __packed;
+
+#endif
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 67c21f9bdbad..820e5751ada7 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -164,8 +164,9 @@ struct kvm_vcpu_events {
struct {
__u8 serror_pending;
__u8 serror_has_esr;
+ __u8 ext_dabt_pending;
/* Align it to 8 bytes */
- __u8 pad[6];
+ __u8 pad[5];
__u64 serror_esr;
} exception;
__u32 reserved[12];
@@ -323,6 +324,8 @@ struct kvm_vcpu_events {
#define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
+#define KVM_ARM_VCPU_PVTIME_CTRL 2
+#define KVM_ARM_VCPU_PVTIME_IPA 0
/* KVM_IRQ_LINE irq field index values */
#define KVM_ARM_IRQ_VCPU2_SHIFT 28
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index bbdc95ee4642..6a09ca7644ea 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -218,40 +218,31 @@ static int detect_harden_bp_fw(void)
struct arm_smccc_res res;
u32 midr = read_cpuid_id();
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+
+ switch ((int)res.a0) {
+ case 1:
+ /* Firmware says we're just fine */
+ return 0;
+ case 0:
+ break;
+ default:
+ return -1;
+ }
+
switch (arm_smccc_1_1_get_conduit()) {
case SMCCC_CONDUIT_HVC:
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
- switch ((int)res.a0) {
- case 1:
- /* Firmware says we're just fine */
- return 0;
- case 0:
- cb = call_hvc_arch_workaround_1;
- /* This is a guest, no need to patch KVM vectors */
- smccc_start = NULL;
- smccc_end = NULL;
- break;
- default:
- return -1;
- }
+ cb = call_hvc_arch_workaround_1;
+ /* This is a guest, no need to patch KVM vectors */
+ smccc_start = NULL;
+ smccc_end = NULL;
break;
case SMCCC_CONDUIT_SMC:
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_1, &res);
- switch ((int)res.a0) {
- case 1:
- /* Firmware says we're just fine */
- return 0;
- case 0:
- cb = call_smc_arch_workaround_1;
- smccc_start = __smccc_workaround_1_smc_start;
- smccc_end = __smccc_workaround_1_smc_end;
- break;
- default:
- return -1;
- }
+ cb = call_smc_arch_workaround_1;
+ smccc_start = __smccc_workaround_1_smc_start;
+ smccc_end = __smccc_workaround_1_smc_end;
break;
default:
@@ -341,6 +332,8 @@ void __init arm64_enable_wa2_handling(struct alt_instr *alt,
void arm64_set_ssbd_mitigation(bool state)
{
+ int conduit;
+
if (!IS_ENABLED(CONFIG_ARM64_SSBD)) {
pr_info_once("SSBD disabled by kernel configuration\n");
return;
@@ -354,19 +347,10 @@ void arm64_set_ssbd_mitigation(bool state)
return;
}
- switch (arm_smccc_1_1_get_conduit()) {
- case SMCCC_CONDUIT_HVC:
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL);
- break;
-
- case SMCCC_CONDUIT_SMC:
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, state, NULL);
- break;
+ conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state,
+ NULL);
- default:
- WARN_ON_ONCE(1);
- break;
- }
+ WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE);
}
static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
@@ -376,6 +360,7 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
bool required = true;
s32 val;
bool this_cpu_safe = false;
+ int conduit;
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
@@ -393,18 +378,10 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
goto out_printmsg;
}
- switch (arm_smccc_1_1_get_conduit()) {
- case SMCCC_CONDUIT_HVC:
- arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_2, &res);
- break;
+ conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_WORKAROUND_2, &res);
- case SMCCC_CONDUIT_SMC:
- arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
- ARM_SMCCC_ARCH_WORKAROUND_2, &res);
- break;
-
- default:
+ if (conduit == SMCCC_CONDUIT_NONE) {
ssbd_state = ARM64_SSBD_UNKNOWN;
if (!this_cpu_safe)
__ssb_safe = false;
diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
index 4cfed91fe256..1ef702b0be2d 100644
--- a/arch/arm64/kernel/paravirt.c
+++ b/arch/arm64/kernel/paravirt.c
@@ -6,13 +6,153 @@
* Author: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
*/
+#define pr_fmt(fmt) "arm-pv: " fmt
+
+#include <linux/arm-smccc.h>
+#include <linux/cpuhotplug.h>
#include <linux/export.h>
+#include <linux/io.h>
#include <linux/jump_label.h>
+#include <linux/printk.h>
+#include <linux/psci.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
#include <linux/types.h>
+
#include <asm/paravirt.h>
+#include <asm/pvclock-abi.h>
+#include <asm/smp_plat.h>
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
struct paravirt_patch_template pv_ops;
EXPORT_SYMBOL_GPL(pv_ops);
+
+struct pv_time_stolen_time_region {
+ struct pvclock_vcpu_stolen_time *kaddr;
+};
+
+static DEFINE_PER_CPU(struct pv_time_stolen_time_region, stolen_time_region);
+
+static bool steal_acc = true;
+static int __init parse_no_stealacc(char *arg)
+{
+ steal_acc = false;
+ return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
+/* return stolen time in ns by asking the hypervisor */
+static u64 pv_steal_clock(int cpu)
+{
+ struct pv_time_stolen_time_region *reg;
+
+ reg = per_cpu_ptr(&stolen_time_region, cpu);
+ if (!reg->kaddr) {
+ pr_warn_once("stolen time enabled but not configured for cpu %d\n",
+ cpu);
+ return 0;
+ }
+
+ return le64_to_cpu(READ_ONCE(reg->kaddr->stolen_time));
+}
+
+static int stolen_time_dying_cpu(unsigned int cpu)
+{
+ struct pv_time_stolen_time_region *reg;
+
+ reg = this_cpu_ptr(&stolen_time_region);
+ if (!reg->kaddr)
+ return 0;
+
+ memunmap(reg->kaddr);
+ memset(reg, 0, sizeof(*reg));
+
+ return 0;
+}
+
+static int init_stolen_time_cpu(unsigned int cpu)
+{
+ struct pv_time_stolen_time_region *reg;
+ struct arm_smccc_res res;
+
+ reg = this_cpu_ptr(&stolen_time_region);
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_ST, &res);
+
+ if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
+ return -EINVAL;
+
+ reg->kaddr = memremap(res.a0,
+ sizeof(struct pvclock_vcpu_stolen_time),
+ MEMREMAP_WB);
+
+ if (!reg->kaddr) {
+ pr_warn("Failed to map stolen time data structure\n");
+ return -ENOMEM;
+ }
+
+ if (le32_to_cpu(reg->kaddr->revision) != 0 ||
+ le32_to_cpu(reg->kaddr->attributes) != 0) {
+ pr_warn_once("Unexpected revision or attributes in stolen time data\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int pv_time_init_stolen_time(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ARM_KVMPV_STARTING,
+ "hypervisor/arm/pvtime:starting",
+ init_stolen_time_cpu, stolen_time_dying_cpu);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static bool has_pv_steal_clock(void)
+{
+ struct arm_smccc_res res;
+
+ /* To detect the presence of PV time support we require SMCCC 1.1+ */
+ if (psci_ops.smccc_version < SMCCC_VERSION_1_1)
+ return false;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_HV_PV_TIME_FEATURES, &res);
+
+ if (res.a0 != SMCCC_RET_SUCCESS)
+ return false;
+
+ arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_TIME_FEATURES,
+ ARM_SMCCC_HV_PV_TIME_ST, &res);
+
+ return (res.a0 == SMCCC_RET_SUCCESS);
+}
+
+int __init pv_time_init(void)
+{
+ int ret;
+
+ if (!has_pv_steal_clock())
+ return 0;
+
+ ret = pv_time_init_stolen_time();
+ if (ret)
+ return ret;
+
+ pv_ops.time.steal_clock = pv_steal_clock;
+
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+
+ pr_info("using stolen time PV\n");
+
+ return 0;
+}
diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index 0b2946414dc9..73f06d4b3aae 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -30,6 +30,7 @@
#include <asm/thread_info.h>
#include <asm/stacktrace.h>
+#include <asm/paravirt.h>
unsigned long profile_pc(struct pt_regs *regs)
{
@@ -65,4 +66,6 @@ void __init time_init(void)
/* Calibrate the delay loop directly */
lpj_fine = arch_timer_rate / HZ;
+
+ pv_time_init();
}
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a67121d419a2..a475c68cbfec 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -21,6 +21,8 @@ if VIRTUALIZATION
config KVM
bool "Kernel-based Virtual Machine (KVM) support"
depends on OF
+ # for TASKSTATS/TASK_DELAY_ACCT:
+ depends on NET && MULTIUSER
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
@@ -39,6 +41,8 @@ config KVM
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_VCPU_RUN_PID_CHANGE
+ select TASKSTATS
+ select TASK_DELAY_ACCT
---help---
Support hosting virtualized guest machines.
We don't support KVM with 16K page tables yet, due to the multiple
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 3ac1a64d2fb9..5ffbdc39e780 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,6 +13,8 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hypercalls.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/pvtime.o
kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index dfd626447482..2fff06114a8f 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -34,6 +34,10 @@
#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU }
struct kvm_stats_debugfs_item debugfs_entries[] = {
+ VCPU_STAT(halt_successful_poll),
+ VCPU_STAT(halt_attempted_poll),
+ VCPU_STAT(halt_poll_invalid),
+ VCPU_STAT(halt_wakeup),
VCPU_STAT(hvc_exit_stat),
VCPU_STAT(wfe_exit_stat),
VCPU_STAT(wfi_exit_stat),
@@ -712,6 +716,12 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
if (events->exception.serror_pending && events->exception.serror_has_esr)
events->exception.serror_esr = vcpu_get_vsesr(vcpu);
+ /*
+ * We never return a pending ext_dabt here because we deliver it to
+ * the virtual CPU directly when setting the event and it's no longer
+ * 'pending' at this point.
+ */
+
return 0;
}
@@ -720,6 +730,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
{
bool serror_pending = events->exception.serror_pending;
bool has_esr = events->exception.serror_has_esr;
+ bool ext_dabt_pending = events->exception.ext_dabt_pending;
if (serror_pending && has_esr) {
if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
@@ -733,6 +744,9 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
kvm_inject_vabt(vcpu);
}
+ if (ext_dabt_pending)
+ kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+
return 0;
}
@@ -858,6 +872,9 @@ int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_set_attr(vcpu, attr);
break;
+ case KVM_ARM_VCPU_PVTIME_CTRL:
+ ret = kvm_arm_pvtime_set_attr(vcpu, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -878,6 +895,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_get_attr(vcpu, attr);
break;
+ case KVM_ARM_VCPU_PVTIME_CTRL:
+ ret = kvm_arm_pvtime_get_attr(vcpu, attr);
+ break;
default:
ret = -ENXIO;
break;
@@ -898,6 +918,9 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
case KVM_ARM_VCPU_TIMER_CTRL:
ret = kvm_arm_timer_has_attr(vcpu, attr);
break;
+ case KVM_ARM_VCPU_PVTIME_CTRL:
+ ret = kvm_arm_pvtime_has_attr(vcpu, attr);
+ break;
default:
ret = -ENXIO;
break;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 706cca23f0d2..aacfc55de44c 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -11,8 +11,6 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
-#include <kvm/arm_psci.h>
-
#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/kvm_asm.h>
@@ -22,6 +20,8 @@
#include <asm/debug-monitors.h>
#include <asm/traps.h>
+#include <kvm/arm_hypercalls.h>
+
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index a9d25a305af5..ccdb6a051ab2 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -109,7 +109,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
/**
* kvm_inject_dabt - inject a data abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
+ * @vcpu: The VCPU to receive the data abort
* @addr: The address to report in the DFAR
*
* It is assumed that this code is called from the VCPU thread and that the
@@ -125,7 +125,7 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
/**
* kvm_inject_pabt - inject a prefetch abort into the guest
- * @vcpu: The VCPU to receive the undefined exception
+ * @vcpu: The VCPU to receive the prefetch abort
* @addr: The address to report in the DFAR
*
* It is assumed that this code is called from the VCPU thread and that the
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 6fe6ad64cba5..4273e799203d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -401,7 +401,6 @@ struct kvmppc_mmu {
u32 (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum);
int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *pte, bool data, bool iswrite);
- void (*reset_msr)(struct kvm_vcpu *vcpu);
void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large);
int (*esid_to_vsid)(struct kvm_vcpu *vcpu, ulong esid, u64 *vsid);
u64 (*ea_to_vp)(struct kvm_vcpu *vcpu, gva_t eaddr, bool data);
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ee62776e5433..d63f649fe713 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -271,6 +271,7 @@ struct kvmppc_ops {
union kvmppc_one_reg *val);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+ void (*inject_interrupt)(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags);
void (*set_msr)(struct kvm_vcpu *vcpu, u64 msr);
int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned int id);
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index b3cbb1136bce..75c7e95a321b 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -748,6 +748,18 @@
#define SPRN_USPRG7 0x107 /* SPRG7 userspace read */
#define SPRN_SRR0 0x01A /* Save/Restore Register 0 */
#define SPRN_SRR1 0x01B /* Save/Restore Register 1 */
+
+#ifdef CONFIG_PPC_BOOK3S
+/*
+ * Bits loaded from MSR upon interrupt.
+ * PPC (64-bit) bits 33-36,42-47 are interrupt dependent, the others are
+ * loaded from MSR. The exception is that SRESET and MCE do not always load
+ * bit 62 (RI) from MSR. Don't use PPC_BITMASK for this because 32-bit uses
+ * it.
+ */
+#define SRR1_MSR_BITS (~0x783f0000UL)
+#endif
+
#define SRR1_ISI_NOPT 0x40000000 /* ISI: Not found in hash */
#define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */
#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index b0f72dea8b11..264e266a85bf 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -667,6 +667,8 @@ struct kvm_ppc_cpu_char {
/* PPC64 eXternal Interrupt Controller Specification */
#define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */
+#define KVM_DEV_XICS_GRP_CTRL 2
+#define KVM_DEV_XICS_NR_SERVERS 1
/* Layout of 64-bit source attribute values */
#define KVM_XICS_DESTINATION_SHIFT 0
@@ -683,6 +685,7 @@ struct kvm_ppc_cpu_char {
#define KVM_DEV_XIVE_GRP_CTRL 1
#define KVM_DEV_XIVE_RESET 1
#define KVM_DEV_XIVE_EQ_SYNC 2
+#define KVM_DEV_XIVE_NR_SERVERS 3
#define KVM_DEV_XIVE_GRP_SOURCE 2 /* 64-bit source identifier */
#define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3 /* 64-bit source identifier */
#define KVM_DEV_XIVE_GRP_EQ_CONFIG 4 /* 64-bit EQ identifier */
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index ec2547cc5ecb..58a59ee998e2 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -74,27 +74,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
-void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
- ulong pc = kvmppc_get_pc(vcpu);
- ulong lr = kvmppc_get_lr(vcpu);
- if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
- kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
- if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
- kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
- vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
- }
-}
-EXPORT_SYMBOL_GPL(kvmppc_unfixup_split_real);
-
-static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
-{
- if (!is_kvmppc_hv_enabled(vcpu->kvm))
- return to_book3s(vcpu)->hior;
- return 0;
-}
-
static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
unsigned long pending_now, unsigned long old_pending)
{
@@ -134,11 +113,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
{
- kvmppc_unfixup_split_real(vcpu);
- kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
- kvmppc_set_srr1(vcpu, (kvmppc_get_msr(vcpu) & ~0x783f0000ul) | flags);
- kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
- vcpu->arch.mmu.reset_msr(vcpu);
+ vcpu->kvm->arch.kvm_ops->inject_interrupt(vcpu, vec, flags);
}
static int kvmppc_book3s_vec2irqprio(unsigned int vec)
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 2ef1311a2a13..3a4613985949 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -32,4 +32,7 @@ extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val);
static inline void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) {}
#endif
+extern void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr);
+extern void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags);
+
#endif
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 18f244aad7aa..f21e73492ce3 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -90,11 +90,6 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
return (((u64)eaddr >> 12) & 0xffff) | (vsid << 16);
}
-static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
-{
- kvmppc_set_msr(vcpu, 0);
-}
-
static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu,
u32 sre, gva_t eaddr,
bool primary)
@@ -406,7 +401,6 @@ void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu)
mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin;
mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin;
mmu->xlate = kvmppc_mmu_book3s_32_xlate;
- mmu->reset_msr = kvmppc_mmu_book3s_32_reset_msr;
mmu->tlbie = kvmppc_mmu_book3s_32_tlbie;
mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid;
mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp;
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 5f63a5f7f24f..599133256a95 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -24,20 +24,6 @@
#define dprintk(X...) do { } while(0)
#endif
-static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
-{
- unsigned long msr = vcpu->arch.intr_msr;
- unsigned long cur_msr = kvmppc_get_msr(vcpu);
-
- /* If transactional, change to suspend mode on IRQ delivery */
- if (MSR_TM_TRANSACTIONAL(cur_msr))
- msr |= MSR_TS_S;
- else
- msr |= cur_msr & MSR_TS_MASK;
-
- kvmppc_set_msr(vcpu, msr);
-}
-
static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
struct kvm_vcpu *vcpu,
gva_t eaddr)
@@ -676,7 +662,6 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu)
mmu->slbie = kvmppc_mmu_book3s_64_slbie;
mmu->slbia = kvmppc_mmu_book3s_64_slbia;
mmu->xlate = kvmppc_mmu_book3s_64_xlate;
- mmu->reset_msr = kvmppc_mmu_book3s_64_reset_msr;
mmu->tlbie = kvmppc_mmu_book3s_64_tlbie;
mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid;
mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 9a75f0e1933b..d381526c5c9b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -275,18 +275,6 @@ int kvmppc_mmu_hv_init(void)
return 0;
}
-static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
-{
- unsigned long msr = vcpu->arch.intr_msr;
-
- /* If transactional, change to suspend mode on IRQ delivery */
- if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
- msr |= MSR_TS_S;
- else
- msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
- kvmppc_set_msr(vcpu, msr);
-}
-
static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh,
unsigned long ptel, unsigned long *pte_idx_ret)
@@ -508,6 +496,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct vm_area_struct *vma;
unsigned long rcbits;
long mmio_update;
+ struct mm_struct *mm;
if (kvm_is_radix(kvm))
return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
@@ -584,6 +573,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
is_ci = false;
pfn = 0;
page = NULL;
+ mm = current->mm;
pte_size = PAGE_SIZE;
writing = (dsisr & DSISR_ISSTORE) != 0;
/* If writing != 0, then the HPTE must allow writing, if we get here */
@@ -592,8 +582,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
if (npages < 1) {
/* Check if it's an I/O mapping */
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, hva);
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, hva);
if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
(vma->vm_flags & VM_PFNMAP)) {
pfn = vma->vm_pgoff +
@@ -602,7 +592,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
write_ok = vma->vm_flags & VM_WRITE;
}
- up_read(&current->mm->mmap_sem);
+ up_read(&mm->mmap_sem);
if (!pfn)
goto out_put;
} else {
@@ -621,8 +611,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
* hugepage split and collapse.
*/
local_irq_save(flags);
- ptep = find_current_mm_pte(current->mm->pgd,
- hva, NULL, NULL);
+ ptep = find_current_mm_pte(mm->pgd, hva, NULL, NULL);
if (ptep) {
pte = kvmppc_read_update_linux_pte(ptep, 1);
if (__pte_write(pte))
@@ -2000,7 +1989,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
if (ret < 0) {
kfree(ctx);
- kvm_put_kvm(kvm);
+ kvm_put_kvm_no_destroy(kvm);
return ret;
}
@@ -2161,7 +2150,6 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
- mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
}
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 5834db0a54c6..883a66e76638 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -317,7 +317,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
if (ret >= 0)
list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
else
- kvm_put_kvm(kvm);
+ kvm_put_kvm_no_destroy(kvm);
mutex_unlock(&kvm->lock);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 709cf1fd4cf4..ec5c0379296a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -133,7 +133,6 @@ static inline bool nesting_enabled(struct kvm *kvm)
/* If set, the threads on each CPU core have to be in the same MMU mode */
static bool no_mixing_hpt_and_radix;
-static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
/*
@@ -338,18 +337,6 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
}
-static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
-{
- /*
- * Check for illegal transactional state bit combination
- * and if we find it, force the TS field to a safe state.
- */
- if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
- msr &= ~MSR_TS_MASK;
- vcpu->arch.shregs.msr = msr;
- kvmppc_end_cede(vcpu);
-}
-
static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
{
vcpu->arch.pvr = pvr;
@@ -792,6 +779,11 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
vcpu->arch.dawr = value1;
vcpu->arch.dawrx = value2;
return H_SUCCESS;
+ case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
+ /* KVM does not support mflags=2 (AIL=2) */
+ if (mflags != 0 && mflags != 3)
+ return H_UNSUPPORTED_FLAG_START;
+ return H_TOO_HARD;
default:
return H_TOO_HARD;
}
@@ -2454,15 +2446,6 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
vcpu->arch.timer_running = 1;
}
-static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.ceded = 0;
- if (vcpu->arch.timer_running) {
- hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
- vcpu->arch.timer_running = 0;
- }
-}
-
extern int __kvmppc_vcore_entry(void);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
@@ -5401,6 +5384,7 @@ static struct kvmppc_ops kvm_ops_hv = {
.set_one_reg = kvmppc_set_one_reg_hv,
.vcpu_load = kvmppc_core_vcpu_load_hv,
.vcpu_put = kvmppc_core_vcpu_put_hv,
+ .inject_interrupt = kvmppc_inject_interrupt_hv,
.set_msr = kvmppc_set_msr_hv,
.vcpu_run = kvmppc_vcpu_run_hv,
.vcpu_create = kvmppc_core_vcpu_create_hv,
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 7c1909657b55..7cd3cf3d366b 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -755,6 +755,71 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
local_paca->kvm_hstate.kvm_split_mode = NULL;
}
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.ceded = 0;
+ if (vcpu->arch.timer_running) {
+ hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+ vcpu->arch.timer_running = 0;
+ }
+}
+
+void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
+{
+ /*
+ * Check for illegal transactional state bit combination
+ * and if we find it, force the TS field to a safe state.
+ */
+ if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
+ msr &= ~MSR_TS_MASK;
+ vcpu->arch.shregs.msr = msr;
+ kvmppc_end_cede(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvmppc_set_msr_hv);
+
+static void inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags)
+{
+ unsigned long msr, pc, new_msr, new_pc;
+
+ msr = kvmppc_get_msr(vcpu);
+ pc = kvmppc_get_pc(vcpu);
+ new_msr = vcpu->arch.intr_msr;
+ new_pc = vec;
+
+ /* If transactional, change to suspend mode on IRQ delivery */
+ if (MSR_TM_TRANSACTIONAL(msr))
+ new_msr |= MSR_TS_S;
+ else
+ new_msr |= msr & MSR_TS_MASK;
+
+ /*
+ * Perform MSR and PC adjustment for LPCR[AIL]=3 if it is set and
+ * applicable. AIL=2 is not supported.
+ *
+ * AIL does not apply to SRESET, MCE, or HMI (which is never
+ * delivered to the guest), and does not apply if IR=0 or DR=0.
+ */
+ if (vec != BOOK3S_INTERRUPT_SYSTEM_RESET &&
+ vec != BOOK3S_INTERRUPT_MACHINE_CHECK &&
+ (vcpu->arch.vcore->lpcr & LPCR_AIL) == LPCR_AIL_3 &&
+ (msr & (MSR_IR|MSR_DR)) == (MSR_IR|MSR_DR) ) {
+ new_msr |= MSR_IR | MSR_DR;
+ new_pc += 0xC000000000004000ULL;
+ }
+
+ kvmppc_set_srr0(vcpu, pc);
+ kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags);
+ kvmppc_set_pc(vcpu, new_pc);
+ vcpu->arch.shregs.msr = new_msr;
+}
+
+void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags)
+{
+ inject_interrupt(vcpu, vec, srr1_flags);
+ kvmppc_end_cede(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvmppc_inject_interrupt_hv);
+
/*
* Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
* Can we inject a Decrementer or a External interrupt?
@@ -762,7 +827,6 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
{
int ext;
- unsigned long vec = 0;
unsigned long lpcr;
/* Insert EXTERNAL bit into LPCR at the MER bit position */
@@ -774,26 +838,16 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
if (vcpu->arch.shregs.msr & MSR_EE) {
if (ext) {
- vec = BOOK3S_INTERRUPT_EXTERNAL;
+ inject_interrupt(vcpu, BOOK3S_INTERRUPT_EXTERNAL, 0);
} else {
long int dec = mfspr(SPRN_DEC);
if (!(lpcr & LPCR_LD))
dec = (int) dec;
if (dec < 0)
- vec = BOOK3S_INTERRUPT_DECREMENTER;
+ inject_interrupt(vcpu,
+ BOOK3S_INTERRUPT_DECREMENTER, 0);
}
}
- if (vec) {
- unsigned long msr, old_msr = vcpu->arch.shregs.msr;
-
- kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
- kvmppc_set_srr1(vcpu, old_msr);
- kvmppc_set_pc(vcpu, vec);
- msr = vcpu->arch.intr_msr;
- if (MSR_TM_ACTIVE(old_msr))
- msr |= MSR_TS_S;
- vcpu->arch.shregs.msr = msr;
- }
if (vcpu->arch.doorbell_request) {
mtspr(SPRN_DPDES, 1);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index cdf30c6eaf54..dc97e5be76f6 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -1186,7 +1186,7 @@ static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
forward_to_l1:
vcpu->arch.fault_dsisr = flags;
if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
- vcpu->arch.shregs.msr &= ~0x783f0000ul;
+ vcpu->arch.shregs.msr &= SRR1_MSR_BITS;
vcpu->arch.shregs.msr |= flags;
}
return RESUME_HOST;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index cc65af8fe6f7..ce4fcf76e53e 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -90,7 +90,43 @@ static void kvmppc_fixup_split_real(struct kvm_vcpu *vcpu)
kvmppc_set_pc(vcpu, pc | SPLIT_HACK_OFFS);
}
-void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu);
+static void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
+ ulong pc = kvmppc_get_pc(vcpu);
+ ulong lr = kvmppc_get_lr(vcpu);
+ if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+ kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+ if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+ kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
+ vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
+ }
+}
+
+static void kvmppc_inject_interrupt_pr(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags)
+{
+ unsigned long msr, pc, new_msr, new_pc;
+
+ kvmppc_unfixup_split_real(vcpu);
+
+ msr = kvmppc_get_msr(vcpu);
+ pc = kvmppc_get_pc(vcpu);
+ new_msr = vcpu->arch.intr_msr;
+ new_pc = to_book3s(vcpu)->hior + vec;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ /* If transactional, change to suspend mode on IRQ delivery */
+ if (MSR_TM_TRANSACTIONAL(msr))
+ new_msr |= MSR_TS_S;
+ else
+ new_msr |= msr & MSR_TS_MASK;
+#endif
+
+ kvmppc_set_srr0(vcpu, pc);
+ kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags);
+ kvmppc_set_pc(vcpu, new_pc);
+ kvmppc_set_msr(vcpu, new_msr);
+}
static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu)
{
@@ -1761,6 +1797,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
#else
/* default to book3s_32 (750) */
vcpu->arch.pvr = 0x84202;
+ vcpu->arch.intr_msr = 0;
#endif
kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr);
vcpu->arch.slb_nr = 64;
@@ -2058,6 +2095,7 @@ static struct kvmppc_ops kvm_ops_pr = {
.set_one_reg = kvmppc_set_one_reg_pr,
.vcpu_load = kvmppc_core_vcpu_load_pr,
.vcpu_put = kvmppc_core_vcpu_put_pr,
+ .inject_interrupt = kvmppc_inject_interrupt_pr,
.set_msr = kvmppc_set_msr_pr,
.vcpu_run = kvmppc_vcpu_run_pr,
.vcpu_create = kvmppc_core_vcpu_create_pr,
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index a3f9c665bb5b..66858b7d3c6b 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1211,6 +1211,45 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.xive_vcpu = NULL;
}
+static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
+{
+ /* We have a block of xive->nr_servers VPs. We just need to check
+ * raw vCPU ids are below the expected limit for this guest's
+ * core stride ; kvmppc_pack_vcpu_id() will pack them down to an
+ * index that can be safely used to compute a VP id that belongs
+ * to the VP block.
+ */
+ return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode;
+}
+
+int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
+{
+ u32 vp_id;
+
+ if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) {
+ pr_devel("Out of bounds !\n");
+ return -EINVAL;
+ }
+
+ if (xive->vp_base == XIVE_INVALID_VP) {
+ xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers);
+ pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers);
+
+ if (xive->vp_base == XIVE_INVALID_VP)
+ return -ENOSPC;
+ }
+
+ vp_id = kvmppc_xive_vp(xive, cpu);
+ if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
+ pr_devel("Duplicate !\n");
+ return -EEXIST;
+ }
+
+ *vp = vp_id;
+
+ return 0;
+}
+
int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu)
{
@@ -1229,20 +1268,13 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
return -EPERM;
if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
- if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
- pr_devel("Out of bounds !\n");
- return -EINVAL;
- }
/* We need to synchronize with queue provisioning */
mutex_lock(&xive->lock);
- vp_id = kvmppc_xive_vp(xive, cpu);
- if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
- pr_devel("Duplicate !\n");
- r = -EEXIST;
+ r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id);
+ if (r)
goto bail;
- }
xc = kzalloc(sizeof(*xc), GFP_KERNEL);
if (!xc) {
@@ -1834,6 +1866,43 @@ int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
return 0;
}
+int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr)
+{
+ u32 __user *ubufp = (u32 __user *) addr;
+ u32 nr_servers;
+ int rc = 0;
+
+ if (get_user(nr_servers, ubufp))
+ return -EFAULT;
+
+ pr_devel("%s nr_servers=%u\n", __func__, nr_servers);
+
+ if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID)
+ return -EINVAL;
+
+ mutex_lock(&xive->lock);
+ if (xive->vp_base != XIVE_INVALID_VP)
+ /* The VP block is allocated once and freed when the device
+ * is released. Better not allow to change its size since its
+ * used by connect_vcpu to validate vCPU ids are valid (eg,
+ * setting it back to a higher value could allow connect_vcpu
+ * to come up with a VP id that goes beyond the VP block, which
+ * is likely to cause a crash in OPAL).
+ */
+ rc = -EBUSY;
+ else if (nr_servers > KVM_MAX_VCPUS)
+ /* We don't need more servers. Higher vCPU ids get packed
+ * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id().
+ */
+ xive->nr_servers = KVM_MAX_VCPUS;
+ else
+ xive->nr_servers = nr_servers;
+
+ mutex_unlock(&xive->lock);
+
+ return rc;
+}
+
static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
struct kvmppc_xive *xive = dev->private;
@@ -1842,6 +1911,11 @@ static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
switch (attr->group) {
case KVM_DEV_XICS_GRP_SOURCES:
return xive_set_source(xive, attr->attr, attr->addr);
+ case KVM_DEV_XICS_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_XICS_NR_SERVERS:
+ return kvmppc_xive_set_nr_servers(xive, attr->addr);
+ }
}
return -ENXIO;
}
@@ -1867,6 +1941,11 @@ static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
attr->attr < KVMPPC_XICS_NR_IRQS)
return 0;
break;
+ case KVM_DEV_XICS_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_XICS_NR_SERVERS:
+ return 0;
+ }
}
return -ENXIO;
}
@@ -2001,10 +2080,13 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xive *xive;
struct kvm *kvm = dev->kvm;
- int ret = 0;
pr_devel("Creating xive for partition\n");
+ /* Already there ? */
+ if (kvm->arch.xive)
+ return -EEXIST;
+
xive = kvmppc_xive_get_device(kvm, type);
if (!xive)
return -ENOMEM;
@@ -2014,12 +2096,6 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
xive->kvm = kvm;
mutex_init(&xive->lock);
- /* Already there ? */
- if (kvm->arch.xive)
- ret = -EEXIST;
- else
- kvm->arch.xive = xive;
-
/* We use the default queue size set by the host */
xive->q_order = xive_native_default_eq_shift();
if (xive->q_order < PAGE_SHIFT)
@@ -2027,18 +2103,16 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
else
xive->q_page_order = xive->q_order - PAGE_SHIFT;
- /* Allocate a bunch of VPs */
- xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
- pr_devel("VP_Base=%x\n", xive->vp_base);
-
- if (xive->vp_base == XIVE_INVALID_VP)
- ret = -ENOMEM;
+ /* VP allocation is delayed to the first call to connect_vcpu */
+ xive->vp_base = XIVE_INVALID_VP;
+ /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets
+ * on a POWER9 system.
+ */
+ xive->nr_servers = KVM_MAX_VCPUS;
xive->single_escalation = xive_native_has_single_escalation();
- if (ret)
- return ret;
-
+ kvm->arch.xive = xive;
return 0;
}
@@ -2108,9 +2182,9 @@ static int xive_debug_show(struct seq_file *m, void *private)
if (!xc)
continue;
- seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x"
+ seq_printf(m, "cpu server %#x VP:%#x CPPR:%#x HWCPPR:%#x"
" MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
- xc->server_num, xc->cppr, xc->hw_cppr,
+ xc->server_num, xc->vp_id, xc->cppr, xc->hw_cppr,
xc->mfrr, xc->pending,
xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index fe3ed50e0818..382e3a56e789 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -135,6 +135,9 @@ struct kvmppc_xive {
/* Flags */
u8 single_escalation;
+ /* Number of entries in the VP block */
+ u32 nr_servers;
+
struct kvmppc_xive_ops *ops;
struct address_space *mapping;
struct mutex mapping_lock;
@@ -296,6 +299,8 @@ int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
struct kvmppc_xive_vcpu *xc, int irq);
+int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp);
+int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr);
#endif /* CONFIG_KVM_XICS */
#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 78b906ffa0d2..d83adb1e1490 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -50,6 +50,24 @@ static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
}
}
+static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q,
+ u8 prio, __be32 *qpage,
+ u32 order, bool can_escalate)
+{
+ int rc;
+ __be32 *qpage_prev = q->qpage;
+
+ rc = xive_native_configure_queue(vp_id, q, prio, qpage, order,
+ can_escalate);
+ if (rc)
+ return rc;
+
+ if (qpage_prev)
+ put_page(virt_to_page(qpage_prev));
+
+ return rc;
+}
+
void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -118,19 +136,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
return -EPERM;
if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
- if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
- pr_devel("Out of bounds !\n");
- return -EINVAL;
- }
mutex_lock(&xive->lock);
- vp_id = kvmppc_xive_vp(xive, server_num);
- if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
- pr_devel("Duplicate !\n");
- rc = -EEXIST;
+ rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
+ if (rc)
goto bail;
- }
xc = kzalloc(sizeof(*xc), GFP_KERNEL);
if (!xc) {
@@ -582,19 +593,14 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
q->guest_qaddr = 0;
q->guest_qshift = 0;
- rc = xive_native_configure_queue(xc->vp_id, q, priority,
- NULL, 0, true);
+ rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
+ NULL, 0, true);
if (rc) {
pr_err("Failed to reset queue %d for VCPU %d: %d\n",
priority, xc->server_num, rc);
return rc;
}
- if (q->qpage) {
- put_page(virt_to_page(q->qpage));
- q->qpage = NULL;
- }
-
return 0;
}
@@ -624,12 +630,6 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
srcu_idx = srcu_read_lock(&kvm->srcu);
gfn = gpa_to_gfn(kvm_eq.qaddr);
- page = gfn_to_page(kvm, gfn);
- if (is_error_page(page)) {
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
- return -EINVAL;
- }
page_size = kvm_host_page_size(kvm, gfn);
if (1ull << kvm_eq.qshift > page_size) {
@@ -638,6 +638,13 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
return -EINVAL;
}
+ page = gfn_to_page(kvm, gfn);
+ if (is_error_page(page)) {
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
+ return -EINVAL;
+ }
+
qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -653,8 +660,8 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
* OPAL level because the use of END ESBs is not supported by
* Linux.
*/
- rc = xive_native_configure_queue(xc->vp_id, q, priority,
- (__be32 *) qaddr, kvm_eq.qshift, true);
+ rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority,
+ (__be32 *) qaddr, kvm_eq.qshift, true);
if (rc) {
pr_err("Failed to configure queue %d for VCPU %d: %d\n",
priority, xc->server_num, rc);
@@ -928,6 +935,8 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
return kvmppc_xive_reset(xive);
case KVM_DEV_XIVE_EQ_SYNC:
return kvmppc_xive_native_eq_sync(xive);
+ case KVM_DEV_XIVE_NR_SERVERS:
+ return kvmppc_xive_set_nr_servers(xive, attr->addr);
}
break;
case KVM_DEV_XIVE_GRP_SOURCE:
@@ -967,6 +976,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
switch (attr->attr) {
case KVM_DEV_XIVE_RESET:
case KVM_DEV_XIVE_EQ_SYNC:
+ case KVM_DEV_XIVE_NR_SERVERS:
return 0;
}
break;
@@ -1067,7 +1077,6 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xive *xive;
struct kvm *kvm = dev->kvm;
- int ret = 0;
pr_devel("Creating xive native device\n");
@@ -1081,27 +1090,20 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
dev->private = xive;
xive->dev = dev;
xive->kvm = kvm;
- kvm->arch.xive = xive;
mutex_init(&xive->mapping_lock);
mutex_init(&xive->lock);
- /*
- * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for
- * a default. Getting the max number of CPUs the VM was
- * configured with would improve our usage of the XIVE VP space.
+ /* VP allocation is delayed to the first call to connect_vcpu */
+ xive->vp_base = XIVE_INVALID_VP;
+ /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets
+ * on a POWER9 system.
*/
- xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
- pr_devel("VP_Base=%x\n", xive->vp_base);
-
- if (xive->vp_base == XIVE_INVALID_VP)
- ret = -ENXIO;
+ xive->nr_servers = KVM_MAX_VCPUS;
xive->single_escalation = xive_native_has_single_escalation();
xive->ops = &kvmppc_xive_native_ops;
- if (ret)
- return ret;
-
+ kvm->arch.xive = xive;
return 0;
}
@@ -1204,8 +1206,8 @@ static int xive_native_debug_show(struct seq_file *m, void *private)
if (!xc)
continue;
- seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n",
- xc->server_num,
+ seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n",
+ xc->server_num, xc->vp_id,
vcpu->arch.xive_saved_state.nsr,
vcpu->arch.xive_saved_state.cppr,
vcpu->arch.xive_saved_state.ipb,
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 321db0fdb9db..425d13806645 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -355,9 +355,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
if (tlbsel == 1) {
struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
+ down_read(&kvm->mm->mmap_sem);
- vma = find_vma(current->mm, hva);
+ vma = find_vma(kvm->mm, hva);
if (vma && hva >= vma->vm_start &&
(vma->vm_flags & VM_PFNMAP)) {
/*
@@ -441,7 +441,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
}
- up_read(&current->mm->mmap_sem);
+ up_read(&kvm->mm->mmap_sem);
}
if (likely(!pfnmap)) {
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3a77bb643452..9e085e931d74 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -522,6 +522,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
+ case KVM_CAP_PPC_GUEST_DEBUG_SSTEP:
+ /* fall through */
case KVM_CAP_PPC_PAIRED_SINGLES:
case KVM_CAP_PPC_OSI:
case KVM_CAP_PPC_GET_PVINFO:
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index abe60268335d..02f4c21c57f6 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -392,6 +392,7 @@ struct kvm_vcpu_stat {
u64 diagnose_10;
u64 diagnose_44;
u64 diagnose_9c;
+ u64 diagnose_9c_ignored;
u64 diagnose_258;
u64 diagnose_308;
u64 diagnose_500;
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 45634b3d2e0a..3fb54ec2cf3e 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -158,14 +158,28 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
vcpu->stat.diagnose_9c++;
- VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d", tid);
+ /* yield to self */
if (tid == vcpu->vcpu_id)
- return 0;
+ goto no_yield;
+ /* yield to invalid */
tcpu = kvm_get_vcpu_by_id(vcpu->kvm, tid);
- if (tcpu)
- kvm_vcpu_yield_to(tcpu);
+ if (!tcpu)
+ goto no_yield;
+
+ /* target already running */
+ if (READ_ONCE(tcpu->cpu) >= 0)
+ goto no_yield;
+
+ if (kvm_vcpu_yield_to(tcpu) <= 0)
+ goto no_yield;
+
+ VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: done", tid);
+ return 0;
+no_yield:
+ VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid);
+ vcpu->stat.diagnose_9c_ignored++;
return 0;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index d1ccc168c071..165dea4c7f19 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1477,8 +1477,7 @@ static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
return 0;
}
-static int __inject_sigp_restart(struct kvm_vcpu *vcpu,
- struct kvm_s390_irq *irq)
+static int __inject_sigp_restart(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -2007,7 +2006,7 @@ static int do_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
rc = __inject_sigp_stop(vcpu, irq);
break;
case KVM_S390_RESTART:
- rc = __inject_sigp_restart(vcpu, irq);
+ rc = __inject_sigp_restart(vcpu);
break;
case KVM_S390_INT_CLOCK_COMP:
rc = __inject_ckc(vcpu);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d047e846e1b9..d9e6bf3d54f0 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -155,6 +155,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "instruction_diag_10", VCPU_STAT(diagnose_10) },
{ "instruction_diag_44", VCPU_STAT(diagnose_44) },
{ "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
+ { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) },
{ "instruction_diag_258", VCPU_STAT(diagnose_258) },
{ "instruction_diag_308", VCPU_STAT(diagnose_308) },
{ "instruction_diag_500", VCPU_STAT(diagnose_500) },
@@ -453,16 +454,14 @@ static void kvm_s390_cpu_feat_init(void)
int kvm_arch_init(void *opaque)
{
- int rc;
+ int rc = -ENOMEM;
kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
if (!kvm_s390_dbf)
return -ENOMEM;
- if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) {
- rc = -ENOMEM;
- goto out_debug_unreg;
- }
+ if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view))
+ goto out;
kvm_s390_cpu_feat_init();
@@ -470,19 +469,17 @@ int kvm_arch_init(void *opaque)
rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
if (rc) {
pr_err("A FLIC registration call failed with rc=%d\n", rc);
- goto out_debug_unreg;
+ goto out;
}
rc = kvm_s390_gib_init(GAL_ISC);
if (rc)
- goto out_gib_destroy;
+ goto out;
return 0;
-out_gib_destroy:
- kvm_s390_gib_destroy();
-out_debug_unreg:
- debug_unregister(kvm_s390_dbf);
+out:
+ kvm_arch_exit();
return rc;
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fcef678c3423..937363b803c1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3323,8 +3323,19 @@ static int intel_pmu_hw_config(struct perf_event *event)
return 0;
}
+#ifdef CONFIG_RETPOLINE
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr);
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr);
+#endif
+
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
{
+#ifdef CONFIG_RETPOLINE
+ if (x86_pmu.guest_get_msrs == intel_guest_get_msrs)
+ return intel_guest_get_msrs(nr);
+ else if (x86_pmu.guest_get_msrs == core_guest_get_msrs)
+ return core_guest_get_msrs(nr);
+#endif
if (x86_pmu.guest_get_msrs)
return x86_pmu.guest_get_msrs(nr);
*nr = 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4fc61483919a..b79cd6aa4075 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -156,10 +156,8 @@ enum kvm_reg {
VCPU_REGS_R15 = __VCPU_REGS_R15,
#endif
VCPU_REGS_RIP,
- NR_VCPU_REGS
-};
+ NR_VCPU_REGS,
-enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
VCPU_EXREG_CR3,
VCPU_EXREG_RFLAGS,
@@ -454,6 +452,11 @@ struct kvm_pmc {
u64 eventsel;
struct perf_event *perf_event;
struct kvm_vcpu *vcpu;
+ /*
+ * eventsel value for general purpose counters,
+ * ctrl value for fixed counters.
+ */
+ u64 current_config;
};
struct kvm_pmu {
@@ -472,7 +475,21 @@ struct kvm_pmu {
struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
struct irq_work irq_work;
- u64 reprogram_pmi;
+ DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
+ DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
+ DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
+
+ /*
+ * The gate to release perf_events not marked in
+ * pmc_in_use only once in a vcpu time slice.
+ */
+ bool need_cleanup;
+
+ /*
+ * The total number of programmed perf_events and it helps to avoid
+ * redundant check before cleanup if guest don't use vPMU at all.
+ */
+ u8 event_count;
};
struct kvm_pmu_ops;
@@ -565,6 +582,7 @@ struct kvm_vcpu_arch {
u64 smbase;
u64 smi_count;
bool tpr_access_reporting;
+ bool xsaves_enabled;
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
@@ -1041,7 +1059,6 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
- void (*decache_cr3)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -1090,7 +1107,7 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*get_enable_apicv)(struct kvm_vcpu *vcpu);
+ bool (*get_enable_apicv)(struct kvm *kvm);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
@@ -1357,6 +1374,7 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
@@ -1577,6 +1595,8 @@ bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
void kvm_make_mclock_inprogress_request(struct kvm *kvm);
void kvm_make_scan_ioapic_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e820568ed4d5..32ef1ee733b7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -33,6 +33,7 @@
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
+#include <asm/cpuidle_haltpoll.h>
static int kvmapf = 1;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31ecf7a76d5a..b19ef421084d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -8,9 +8,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
-kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o page_track.o debugfs.o
+ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o
kvm-amd-y += svm.o pmu_amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f68c0c753c38..c0aa07487eb8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -816,8 +816,6 @@ static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func,
return __do_cpuid_func(entry, func, nent, maxnent);
}
-#undef F
-
struct kvm_cpuid_param {
u32 func;
bool (*qualifier)(const struct kvm_cpuid_param *param);
@@ -1015,6 +1013,12 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
*ebx = entry->ebx;
*ecx = entry->ecx;
*edx = entry->edx;
+ if (function == 7 && index == 0) {
+ u64 data;
+ if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
+ (data & TSX_CTRL_CPUID_CLEAR))
+ *ebx &= ~(F(RTM) | F(HLE));
+ }
} else {
*eax = *ebx = *ecx = *edx = 0;
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 698efb8c3897..952d1a4f4d7e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2770,11 +2770,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
return emulate_ud(ctxt);
ops->get_msr(ctxt, MSR_EFER, &efer);
- setup_syscalls_segments(ctxt, &cs, &ss);
-
if (!(efer & EFER_SCE))
return emulate_ud(ctxt);
+ setup_syscalls_segments(ctxt, &cs, &ss);
ops->get_msr(ctxt, MSR_STAR, &msr_data);
msr_data >>= 32;
cs_sel = (u16)(msr_data & 0xfffc);
@@ -2838,12 +2837,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
if (ctxt->mode == X86EMUL_MODE_PROT64)
return X86EMUL_UNHANDLEABLE;
- setup_syscalls_segments(ctxt, &cs, &ss);
-
ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
+ setup_syscalls_segments(ctxt, &cs, &ss);
ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
ss_sel = cs_sel + 8;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index d859ae8890d0..9fd2dd89a1c5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -271,8 +271,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
bool mask_before, mask_after;
- int old_remote_irr, old_delivery_status;
union kvm_ioapic_redirect_entry *e;
+ unsigned long vcpu_bitmap;
+ int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
switch (ioapic->ioregsel) {
case IOAPIC_REG_VERSION:
@@ -296,6 +297,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
/* Preserve read-only fields */
old_remote_irr = e->fields.remote_irr;
old_delivery_status = e->fields.delivery_status;
+ old_dest_id = e->fields.dest_id;
+ old_dest_mode = e->fields.dest_mode;
if (ioapic->ioregsel & 1) {
e->bits &= 0xffffffff;
e->bits |= (u64) val << 32;
@@ -321,7 +324,34 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
&& ioapic->irr & (1 << index))
ioapic_service(ioapic, index, false);
- kvm_make_scan_ioapic_request(ioapic->kvm);
+ if (e->fields.delivery_mode == APIC_DM_FIXED) {
+ struct kvm_lapic_irq irq;
+
+ irq.shorthand = 0;
+ irq.vector = e->fields.vector;
+ irq.delivery_mode = e->fields.delivery_mode << 8;
+ irq.dest_id = e->fields.dest_id;
+ irq.dest_mode = e->fields.dest_mode;
+ bitmap_zero(&vcpu_bitmap, 16);
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ &vcpu_bitmap);
+ if (old_dest_mode != e->fields.dest_mode ||
+ old_dest_id != e->fields.dest_id) {
+ /*
+ * Update vcpu_bitmap with vcpus specified in
+ * the previous request as well. This is done to
+ * keep ioapic_handled_vectors synchronized.
+ */
+ irq.dest_id = old_dest_id;
+ irq.dest_mode = old_dest_mode;
+ kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
+ &vcpu_bitmap);
+ }
+ kvm_make_scan_ioapic_request_mask(ioapic->kvm,
+ &vcpu_bitmap);
+ } else {
+ kvm_make_scan_ioapic_request(ioapic->kvm);
+ }
break;
}
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1cc6c47dc77e..58767020de41 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -37,22 +37,50 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14)
BUILD_KVM_GPR_ACCESSORS(r15, R15)
#endif
-static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
{
- if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
+{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return 0;
+
+ if (!kvm_register_is_available(vcpu, reg))
kvm_x86_ops->cache_reg(vcpu, reg);
return vcpu->arch.regs[reg];
}
-static inline void kvm_register_write(struct kvm_vcpu *vcpu,
- enum kvm_reg reg,
+static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg,
unsigned long val)
{
+ if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS))
+ return;
+
vcpu->arch.regs[reg] = val;
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ kvm_register_mark_dirty(vcpu, reg);
}
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
@@ -79,9 +107,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
might_sleep(); /* on svm */
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
- kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -109,8 +136,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- kvm_x86_ops->decache_cr3(vcpu);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b29d00b661ff..cf9177b4a07f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -557,60 +557,53 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
irq->level, irq->trig_mode, dest_map);
}
+static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
+ struct kvm_lapic_irq *irq, u32 min)
+{
+ int i, count = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (min > map->max_apic_id)
+ return 0;
+
+ for_each_set_bit(i, ipi_bitmap,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, irq, NULL);
+ }
+ }
+
+ return count;
+}
+
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit)
{
- int i;
struct kvm_apic_map *map;
- struct kvm_vcpu *vcpu;
struct kvm_lapic_irq irq = {0};
int cluster_size = op_64_bit ? 64 : 32;
- int count = 0;
+ int count;
+
+ if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
+ return -KVM_EINVAL;
irq.vector = icr & APIC_VECTOR_MASK;
irq.delivery_mode = icr & APIC_MODE_MASK;
irq.level = (icr & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr & APIC_INT_LEVELTRIG;
- if (icr & APIC_DEST_MASK)
- return -KVM_EINVAL;
- if (icr & APIC_SHORT_MASK)
- return -KVM_EINVAL;
-
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
- if (unlikely(!map)) {
- count = -EOPNOTSUPP;
- goto out;
- }
-
- if (min > map->max_apic_id)
- goto out;
- /* Bits above cluster_size are masked in the caller. */
- for_each_set_bit(i, &ipi_bitmap_low,
- min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
- if (map->phys_map[min + i]) {
- vcpu = map->phys_map[min + i]->vcpu;
- count += kvm_apic_set_irq(vcpu, &irq, NULL);
- }
+ count = -EOPNOTSUPP;
+ if (likely(map)) {
+ count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
+ min += cluster_size;
+ count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
}
- min += cluster_size;
-
- if (min > map->max_apic_id)
- goto out;
-
- for_each_set_bit(i, &ipi_bitmap_high,
- min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
- if (map->phys_map[min + i]) {
- vcpu = map->phys_map[min + i]->vcpu;
- count += kvm_apic_set_irq(vcpu, &irq, NULL);
- }
- }
-
-out:
rcu_read_unlock();
return count;
}
@@ -1124,6 +1117,50 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
return result;
}
+/*
+ * This routine identifies the destination vcpus mask meant to receive the
+ * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find
+ * out the destination vcpus array and set the bitmap or it traverses to
+ * each available vcpu to identify the same.
+ */
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap)
+{
+ struct kvm_lapic **dest_vcpu = NULL;
+ struct kvm_lapic *src = NULL;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ unsigned long bitmap;
+ int i, vcpu_idx;
+ bool ret;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu,
+ &bitmap);
+ if (ret) {
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dest_vcpu[i])
+ continue;
+ vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx;
+ __set_bit(vcpu_idx, vcpu_bitmap);
+ }
+ } else {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+ if (!kvm_apic_match_dest(vcpu, NULL,
+ irq->delivery_mode,
+ irq->dest_id,
+ irq->dest_mode))
+ continue;
+ __set_bit(i, vcpu_bitmap);
+ }
+ }
+ rcu_read_unlock();
+}
+
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
{
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
@@ -2709,7 +2746,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
* KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs
* and leave the INIT pending.
*/
- if (is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu)) {
+ if (kvm_vcpu_latch_init(vcpu)) {
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1f5014852e20..39925afdfcdc 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -226,6 +226,9 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu);
+void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ unsigned long *vcpu_bitmap);
+
bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2ce9da58611e..6f92b40d798c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4395,7 +4395,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
if (!skip_tlb_flush) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_x86_ops->tlb_flush(vcpu, true);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
/*
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 3521e2d176f2..3521e2d176f2 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 97b21e7fd013..97b21e7fd013 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 46875bbd0419..d5e6d5b3f06f 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -62,8 +62,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx,
- (unsigned long *)&pmu->reprogram_pmi)) {
+ if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
@@ -76,8 +75,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx,
- (unsigned long *)&pmu->reprogram_pmi)) {
+ if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
@@ -137,7 +135,37 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
}
pmc->perf_event = event;
- clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc_to_pmu(pmc)->event_count++;
+ clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+}
+
+static void pmc_pause_counter(struct kvm_pmc *pmc)
+{
+ u64 counter = pmc->counter;
+
+ if (!pmc->perf_event)
+ return;
+
+ /* update counter, reset event value to avoid redundant accumulation */
+ counter += perf_event_pause(pmc->perf_event, true);
+ pmc->counter = counter & pmc_bitmask(pmc);
+}
+
+static bool pmc_resume_counter(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event)
+ return false;
+
+ /* recalibrate sample period and check if it's accepted by perf core */
+ if (perf_event_period(pmc->perf_event,
+ (-pmc->counter) & pmc_bitmask(pmc)))
+ return false;
+
+ /* reuse perf_event to serve as pmc_reprogram_counter() does*/
+ perf_event_enable(pmc->perf_event);
+
+ clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ return true;
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
@@ -154,7 +182,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
pmc->eventsel = eventsel;
- pmc_stop_counter(pmc);
+ pmc_pause_counter(pmc);
if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
return;
@@ -193,6 +221,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
if (type == PERF_TYPE_RAW)
config = eventsel & X86_RAW_EVENT_MASK;
+ if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
+ return;
+
+ pmc_release_perf_event(pmc);
+
+ pmc->current_config = eventsel;
pmc_reprogram_counter(pmc, type, config,
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
@@ -209,7 +243,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
struct kvm_pmu_event_filter *filter;
struct kvm *kvm = pmc->vcpu->kvm;
- pmc_stop_counter(pmc);
+ pmc_pause_counter(pmc);
if (!en_field || !pmc_is_enabled(pmc))
return;
@@ -224,6 +258,12 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
return;
}
+ if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc))
+ return;
+
+ pmc_release_perf_event(pmc);
+
+ pmc->current_config = (u64)ctrl;
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
kvm_x86_ops->pmu_ops->find_fixed_event(idx),
!(en_field & 0x2), /* exclude user */
@@ -253,27 +293,32 @@ EXPORT_SYMBOL_GPL(reprogram_counter);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- u64 bitmask;
int bit;
- bitmask = pmu->reprogram_pmi;
-
- for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+ for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
if (unlikely(!pmc || !pmc->perf_event)) {
- clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+ clear_bit(bit, pmu->reprogram_pmi);
continue;
}
reprogram_counter(pmu, bit);
}
+
+ /*
+ * Unused perf_events are only released if the corresponding MSRs
+ * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
+ * triggers KVM_REQ_PMU if cleanup is needed.
+ */
+ if (unlikely(pmu->need_cleanup))
+ kvm_pmu_cleanup(vcpu);
}
/* check if idx is a valid index to access PMU */
-int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
- return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
+ return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -323,7 +368,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
+ pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
if (!pmc)
return 1;
@@ -339,7 +384,17 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
- return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+ return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
+ kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+}
+
+static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr);
+
+ if (pmc)
+ __set_bit(pmc->idx, pmu->pmc_in_use);
}
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
@@ -349,6 +404,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
+ kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
}
@@ -376,9 +432,45 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
memset(pmu, 0, sizeof(*pmu));
kvm_x86_ops->pmu_ops->init(vcpu);
init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
+ pmu->event_count = 0;
+ pmu->need_cleanup = false;
kvm_pmu_refresh(vcpu);
}
+static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (pmc_is_fixed(pmc))
+ return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
+
+ return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
+}
+
+/* Release perf_events for vPMCs that have been unused for a full time slice. */
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = NULL;
+ DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
+ int i;
+
+ pmu->need_cleanup = false;
+
+ bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
+ pmu->pmc_in_use, X86_PMC_IDX_MAX);
+
+ for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
+ pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i);
+
+ if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
+ pmc_stop_counter(pmc);
+ }
+
+ bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
+}
+
void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_pmu_reset(vcpu);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 58265f761c3b..7ebb62326c14 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -25,9 +25,10 @@ struct kvm_pmu_ops {
unsigned (*find_fixed_event)(int idx);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
- struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx,
- u64 *mask);
- int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
+ struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -55,12 +56,21 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
return counter & pmc_bitmask(pmc);
}
-static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+static inline void pmc_release_perf_event(struct kvm_pmc *pmc)
{
if (pmc->perf_event) {
- pmc->counter = pmc_read_counter(pmc);
perf_event_release_kernel(pmc->perf_event);
pmc->perf_event = NULL;
+ pmc->current_config = 0;
+ pmc_to_pmu(pmc)->event_count--;
+ }
+}
+
+static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ pmc_release_perf_event(pmc);
}
}
@@ -79,6 +89,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
}
+static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
+ u64 data)
+{
+ return !(pmu->global_ctrl_mask & data);
+}
+
/* returns general purpose PMC with the specified MSR. Note that it can be
* used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
* paramenter to tell them apart.
@@ -110,13 +126,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
+int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
void kvm_pmu_reset(struct kvm_vcpu *vcpu);
void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index c8388389a3b0..ce0b10fe5e2b 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -174,7 +174,7 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
}
/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -184,7 +184,8 @@ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
}
/* idx is the ECX register of RDPMC instruction */
-static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask)
+static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *counters;
@@ -199,13 +200,19 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u
static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
+ /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */
+ return false;
+}
+
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- int ret = false;
+ struct kvm_pmc *pmc;
- ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) ||
- get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
- return ret;
+ return pmc;
}
static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
@@ -272,6 +279,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->nr_arch_fixed_counters = 0;
pmu->global_status = 0;
+ bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
}
static void amd_pmu_init(struct kvm_vcpu *vcpu)
@@ -285,6 +293,7 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
}
}
@@ -306,8 +315,9 @@ struct kvm_pmu_ops amd_pmu_ops = {
.find_fixed_event = amd_find_fixed_event,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
- .is_valid_msr_idx = amd_is_valid_msr_idx,
+ .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx,
.is_valid_msr = amd_is_valid_msr,
.get_msr = amd_pmu_get_msr,
.set_msr = amd_pmu_set_msr,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c5673bda4b66..362e874297e4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -38,6 +38,7 @@
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/rwsem.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -418,9 +419,13 @@ enum {
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+static int sev_flush_asids(void);
+static DECLARE_RWSEM(sev_deactivate_lock);
+static DEFINE_MUTEX(sev_bitmap_lock);
static unsigned int max_sev_asid;
static unsigned int min_sev_asid;
static unsigned long *sev_asid_bitmap;
+static unsigned long *sev_reclaim_asid_bitmap;
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
struct enc_region {
@@ -1235,11 +1240,15 @@ static __init int sev_hardware_setup(void)
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = cpuid_edx(0x8000001F);
- /* Initialize SEV ASID bitmap */
+ /* Initialize SEV ASID bitmaps */
sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
if (!sev_asid_bitmap)
return 1;
+ sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ if (!sev_reclaim_asid_bitmap)
+ return 1;
+
status = kmalloc(sizeof(*status), GFP_KERNEL);
if (!status)
return 1;
@@ -1418,8 +1427,12 @@ static __exit void svm_hardware_unsetup(void)
{
int cpu;
- if (svm_sev_enabled())
+ if (svm_sev_enabled()) {
bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ sev_flush_asids();
+ }
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
@@ -1729,25 +1742,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
-static void __sev_asid_free(int asid)
+static void sev_asid_free(int asid)
{
struct svm_cpu_data *sd;
int cpu, pos;
+ mutex_lock(&sev_bitmap_lock);
+
pos = asid - 1;
- clear_bit(pos, sev_asid_bitmap);
+ __set_bit(pos, sev_reclaim_asid_bitmap);
for_each_possible_cpu(cpu) {
sd = per_cpu(svm_data, cpu);
sd->sev_vmcbs[pos] = NULL;
}
-}
-static void sev_asid_free(struct kvm *kvm)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
-
- __sev_asid_free(sev->asid);
+ mutex_unlock(&sev_bitmap_lock);
}
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -1764,10 +1774,12 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
/* deactivate handle */
data->handle = handle;
+
+ /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
+ down_read(&sev_deactivate_lock);
sev_guest_deactivate(data, NULL);
+ up_read(&sev_deactivate_lock);
- wbinvd_on_all_cpus();
- sev_guest_df_flush(NULL);
kfree(data);
decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
@@ -1916,7 +1928,7 @@ static void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(&kvm->lock);
sev_unbind_asid(kvm, sev->handle);
- sev_asid_free(kvm);
+ sev_asid_free(sev->asid);
}
static void avic_vm_destroy(struct kvm *kvm)
@@ -2370,7 +2382,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
}
}
@@ -2523,10 +2535,6 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
-static void svm_decache_cr3(struct kvm_vcpu *vcpu)
-{
-}
-
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
@@ -4997,6 +5005,18 @@ static int handle_exit(struct kvm_vcpu *vcpu)
return 0;
}
+#ifdef CONFIG_RETPOLINE
+ if (exit_code == SVM_EXIT_MSR)
+ return msr_interception(svm);
+ else if (exit_code == SVM_EXIT_VINTR)
+ return interrupt_window_interception(svm);
+ else if (exit_code == SVM_EXIT_INTR)
+ return intr_interception(svm);
+ else if (exit_code == SVM_EXIT_HLT)
+ return halt_interception(svm);
+ else if (exit_code == SVM_EXIT_NPF)
+ return npf_interception(svm);
+#endif
return svm_exit_handlers[exit_code](svm);
}
@@ -5092,8 +5112,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu) ||
- kvm_vcpu_apicv_active(vcpu))
+ if (svm_nested_virtualize_tpr(vcpu))
return;
clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
@@ -5110,9 +5129,9 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
return;
}
-static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
+static bool svm_get_enable_apicv(struct kvm *kvm)
{
- return avic && irqchip_split(vcpu->kvm);
+ return avic && irqchip_split(kvm);
}
static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
@@ -5634,7 +5653,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
svm->vmcb->save.cr2 = vcpu->arch.cr2;
clgi();
- kvm_load_guest_xcr0(vcpu);
+ kvm_load_guest_xsave_state(vcpu);
if (lapic_in_kernel(vcpu) &&
vcpu->arch.apic->lapic_timer.timer_advance_ns)
@@ -5784,7 +5803,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(&svm->vcpu);
- kvm_put_guest_xcr0(vcpu);
+ kvm_load_host_xsave_state(vcpu);
stgi();
/* Any pending NMI will happen here */
@@ -5893,6 +5912,9 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVES);
+
/* Update nrips enabled cache */
svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
@@ -5968,7 +5990,7 @@ static bool svm_mpx_supported(void)
static bool svm_xsaves_supported(void)
{
- return false;
+ return boot_cpu_has(X86_FEATURE_XSAVES);
}
static bool svm_umip_emulated(void)
@@ -6270,18 +6292,73 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
return 0;
}
+static int sev_flush_asids(void)
+{
+ int ret, error;
+
+ /*
+ * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail,
+ * so it must be guarded.
+ */
+ down_write(&sev_deactivate_lock);
+
+ wbinvd_on_all_cpus();
+ ret = sev_guest_df_flush(&error);
+
+ up_write(&sev_deactivate_lock);
+
+ if (ret)
+ pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error);
+
+ return ret;
+}
+
+/* Must be called with the sev_bitmap_lock held */
+static bool __sev_recycle_asids(void)
+{
+ int pos;
+
+ /* Check if there are any ASIDs to reclaim before performing a flush */
+ pos = find_next_bit(sev_reclaim_asid_bitmap,
+ max_sev_asid, min_sev_asid - 1);
+ if (pos >= max_sev_asid)
+ return false;
+
+ if (sev_flush_asids())
+ return false;
+
+ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
+ max_sev_asid);
+ bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid);
+
+ return true;
+}
+
static int sev_asid_new(void)
{
+ bool retry = true;
int pos;
+ mutex_lock(&sev_bitmap_lock);
+
/*
* SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
*/
+again:
pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
- if (pos >= max_sev_asid)
+ if (pos >= max_sev_asid) {
+ if (retry && __sev_recycle_asids()) {
+ retry = false;
+ goto again;
+ }
+ mutex_unlock(&sev_bitmap_lock);
return -EBUSY;
+ }
+
+ __set_bit(pos, sev_asid_bitmap);
+
+ mutex_unlock(&sev_bitmap_lock);
- set_bit(pos, sev_asid_bitmap);
return pos + 1;
}
@@ -6309,7 +6386,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
return 0;
e_free:
- __sev_asid_free(asid);
+ sev_asid_free(asid);
return ret;
}
@@ -6319,12 +6396,6 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
int asid = sev_get_asid(kvm);
int ret;
- wbinvd_on_all_cpus();
-
- ret = sev_guest_df_flush(error);
- if (ret)
- return ret;
-
data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -7214,7 +7285,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
- .decache_cr3 = svm_decache_cr3,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0e7c9301fe86..4aea7d304beb 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -10,6 +10,7 @@
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
+#include "pmu.h"
#include "trace.h"
#include "x86.h"
@@ -27,6 +28,16 @@ module_param(nested_early_check, bool, S_IRUGO);
failed; \
})
+#define SET_MSR_OR_WARN(vcpu, idx, data) \
+({ \
+ bool failed = kvm_set_msr(vcpu, idx, data); \
+ if (failed) \
+ pr_warn_ratelimited( \
+ "%s cannot write MSR (0x%x, 0x%llx)\n", \
+ __func__, idx, data); \
+ failed; \
+})
+
/*
* Hyper-V requires all of these, so mark them as supported even though
* they are just treated the same as all-context.
@@ -257,7 +268,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.cached_shadow_vmcs12 = NULL;
/* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
@@ -929,6 +940,57 @@ fail:
return i + 1;
}
+static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
+ u32 msr_index,
+ u64 *data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If the L0 hypervisor stored a more accurate value for the TSC that
+ * does not include the time taken for emulation of the L2->L1
+ * VM-exit in L0, use the more accurate value.
+ */
+ if (msr_index == MSR_IA32_TSC) {
+ int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
+ MSR_IA32_TSC);
+
+ if (index >= 0) {
+ u64 val = vmx->msr_autostore.guest.val[index].value;
+
+ *data = kvm_read_l1_tsc(vcpu, val);
+ return true;
+ }
+ }
+
+ if (kvm_get_msr(vcpu, msr_index, data)) {
+ pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
+ msr_index);
+ return false;
+ }
+ return true;
+}
+
+static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
+ struct vmx_msr_entry *e)
+{
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(*e),
+ e, 2 * sizeof(u32))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(*e));
+ return false;
+ }
+ if (nested_vmx_store_msr_check(vcpu, e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e->index, e->reserved);
+ return false;
+ }
+ return true;
+}
+
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
u64 data;
@@ -940,26 +1002,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
if (unlikely(i >= max_msr_list_size))
return -EINVAL;
- if (kvm_vcpu_read_guest(vcpu,
- gpa + i * sizeof(e),
- &e, 2 * sizeof(u32))) {
- pr_debug_ratelimited(
- "%s cannot read MSR entry (%u, 0x%08llx)\n",
- __func__, i, gpa + i * sizeof(e));
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
return -EINVAL;
- }
- if (nested_vmx_store_msr_check(vcpu, &e)) {
- pr_debug_ratelimited(
- "%s check failed (%u, 0x%x, 0x%x)\n",
- __func__, i, e.index, e.reserved);
- return -EINVAL;
- }
- if (kvm_get_msr(vcpu, e.index, &data)) {
- pr_debug_ratelimited(
- "%s cannot read MSR (%u, 0x%x)\n",
- __func__, i, e.index);
+
+ if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
return -EINVAL;
- }
+
if (kvm_vcpu_write_guest(vcpu,
gpa + i * sizeof(e) +
offsetof(struct vmx_msr_entry, value),
@@ -973,6 +1021,60 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
return 0;
}
+static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 count = vmcs12->vm_exit_msr_store_count;
+ u64 gpa = vmcs12->vm_exit_msr_store_addr;
+ struct vmx_msr_entry e;
+ u32 i;
+
+ for (i = 0; i < count; i++) {
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+ return false;
+
+ if (e.index == msr_index)
+ return true;
+ }
+ return false;
+}
+
+static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
+ u32 msr_index)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
+ bool in_vmcs12_store_list;
+ int msr_autostore_index;
+ bool in_autostore_list;
+ int last;
+
+ msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
+ in_autostore_list = msr_autostore_index >= 0;
+ in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
+
+ if (in_vmcs12_store_list && !in_autostore_list) {
+ if (autostore->nr == NR_LOADSTORE_MSRS) {
+ /*
+ * Emulated VMEntry does not fail here. Instead a less
+ * accurate value will be returned by
+ * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
+ * instead of reading the value from the vmcs02 VMExit
+ * MSR-store area.
+ */
+ pr_warn_ratelimited(
+ "Not enough msr entries in msr_autostore. Can't add msr %x\n",
+ msr_index);
+ return;
+ }
+ last = autostore->nr++;
+ autostore->val[last].index = msr_index;
+ } else if (!in_vmcs12_store_list && in_autostore_list) {
+ last = --autostore->nr;
+ autostore->val[msr_autostore_index] = autostore->val[last];
+ }
+}
+
static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
unsigned long invalid_mask;
@@ -1012,7 +1114,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
kvm_mmu_new_cr3(vcpu, cr3, false);
vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_init_mmu(vcpu, false);
@@ -1024,7 +1126,9 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* populated by L2 differently than TLB entries populated
* by L1.
*
- * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ * If L0 uses EPT, L1 and L2 run with different EPTP because
+ * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
+ * are tagged with different EPTP.
*
* If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
* with different VPID (L1 entries are tagged with vmx->vpid
@@ -1034,7 +1138,7 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- return nested_cpu_has_ept(vmcs12) ||
+ return enable_ept ||
(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}
@@ -2018,7 +2122,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
* addresses are constant (for vmcs02), the counts can change based
* on L2's behavior, e.g. switching to/from long mode.
*/
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
@@ -2073,6 +2177,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
+ vmx->nested.l1_tpr_threshold = -1;
if (exec_control & CPU_BASED_TPR_SHADOW)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
@@ -2285,6 +2390,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
}
+ /*
+ * Make sure the msr_autostore list is up to date before we set the
+ * count in the vmcs02.
+ */
+ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -2381,9 +2493,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (nested_cpu_has_ept(vmcs12))
nested_ept_init_mmu_context(vcpu);
- else if (nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
- vmx_flush_tlb(vcpu, true);
/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -2418,6 +2527,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
entry_failure_code))
return -EINVAL;
+ /*
+ * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
+ * on nested VM-Exit, which can occur without actually running L2 and
+ * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
+ * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
+ * transition to HLT instead of running L2.
+ */
+ if (enable_ept)
+ vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
+
/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
is_pae_paging(vcpu)) {
@@ -2430,6 +2549,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->guest_ia32_perf_global_ctrl))
+ return -EINVAL;
+
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
return 0;
@@ -2664,6 +2788,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
return -EINVAL;
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->host_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
#ifdef CONFIG_X86_64
ia32e = !!(vcpu->arch.efer & EFER_LMA);
#else
@@ -2779,6 +2908,11 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
return -EINVAL;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->guest_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-entry control is 1, the following checks
* are performed on the field for the IA32_EFER MSR:
@@ -2933,7 +3067,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* to it so we can release it later.
*/
if (vmx->nested.apic_access_page) { /* shouldn't happen */
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
@@ -3461,6 +3595,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
if (block_nested_events)
return -EBUSY;
+ clear_bit(KVM_APIC_INIT, &apic->pending_events);
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
return 0;
}
@@ -3864,8 +3999,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.pat = vmcs12->host_ia32_pat;
}
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
- vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
- vmcs12->host_ia32_perf_global_ctrl);
+ SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl);
/* Set L1 segment info according to Intel SDM
27.5.2 Loading Host Segment and Descriptor-Table Registers */
@@ -3984,7 +4119,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
nested_ept_uninit_mmu_context(vcpu);
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -4112,6 +4247,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (vmx->nested.l1_tpr_threshold != -1)
+ vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
@@ -4119,15 +4256,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
vmx->nested.change_vmcs01_virtual_apic_mode = false;
vmx_set_virtual_apic_mode(vcpu);
- } else if (!nested_cpu_has_ept(vmcs12) &&
- nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- vmx_flush_tlb(vcpu, true);
}
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
@@ -4327,6 +4460,27 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
return 0;
}
+void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx;
+
+ if (!nested_vmx_allowed(vcpu))
+ return;
+
+ vmx = to_vmx(vcpu);
+ if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ vmx->nested.msrs.entry_ctls_high |=
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmx->nested.msrs.exit_ctls_high |=
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &=
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmx->nested.msrs.exit_ctls_high &=
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ }
+}
+
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
gva_t gva;
@@ -5766,7 +5920,7 @@ error_guest_mode:
return ret;
}
-void nested_vmx_vcpu_setup(void)
+void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
if (enable_shadow_vmcs) {
vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
@@ -6047,23 +6201,23 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
init_vmcs_shadow_fields();
}
- exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear,
- exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
- exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld,
- exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst,
- exit_handlers[EXIT_REASON_VMREAD] = handle_vmread,
- exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume,
- exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite,
- exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff,
- exit_handlers[EXIT_REASON_VMON] = handle_vmon,
- exit_handlers[EXIT_REASON_INVEPT] = handle_invept,
- exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid,
- exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc,
+ exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
+ exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
+ exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
+ exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
+ exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
+ exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
+ exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
+ exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
+ exit_handlers[EXIT_REASON_VMON] = handle_vmon;
+ exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
+ exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
+ exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
kvm_x86_ops->check_nested_events = vmx_check_nested_events;
kvm_x86_ops->get_nested_state = vmx_get_nested_state;
kvm_x86_ops->set_nested_state = vmx_set_nested_state;
- kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages,
+ kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6280f33e5fa6..fc874d4ead0f 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -21,7 +21,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
bool apicv);
void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
-void nested_vmx_vcpu_setup(void);
+void nested_vmx_set_vmcs_shadowing_bitmap(void);
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
bool from_vmentry);
@@ -33,6 +33,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
+void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
@@ -256,7 +257,7 @@ static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
return ((val & fixed1) | fixed0) == val;
}
-static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
@@ -270,7 +271,7 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
return fixed_bits_valid(val, fixed0, fixed1);
}
-static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
@@ -278,7 +279,7 @@ static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
return fixed_bits_valid(val, fixed0, fixed1);
}
-static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 3e9c059099e9..7023138b1cb0 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -15,6 +15,7 @@
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
+#include "nested.h"
#include "pmu.h"
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
@@ -46,6 +47,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
if (old_ctrl == new_ctrl)
continue;
+ __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
reprogram_fixed_counter(pmc, new_ctrl, i);
}
@@ -111,7 +113,7 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
}
/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
@@ -122,8 +124,8 @@ static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
(fixed && idx >= pmu->nr_arch_fixed_counters);
}
-static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
- unsigned idx, u64 *mask)
+static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
@@ -162,6 +164,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
return ret;
}
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, msr);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0);
+
+ return pmc;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -223,7 +237,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_CTRL:
if (pmu->global_ctrl == data)
return 0;
- if (!(data & pmu->global_ctrl_mask)) {
+ if (kvm_valid_perf_global_ctrl(pmu, data)) {
global_ctrl_changed(pmu, data);
return 0;
}
@@ -317,6 +331,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
(entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+
+ bitmap_set(pmu->all_valid_pmc_idx,
+ 0, pmu->nr_arch_gp_counters);
+ bitmap_set(pmu->all_valid_pmc_idx,
+ INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
+
+ nested_vmx_pmu_entry_exit_ctls_update(vcpu);
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
@@ -328,12 +349,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
}
for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].current_config = 0;
}
}
@@ -366,8 +389,9 @@ struct kvm_pmu_ops intel_pmu_ops = {
.find_fixed_event = intel_find_fixed_event,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = intel_msr_idx_to_pmc,
- .is_valid_msr_idx = intel_is_valid_msr_idx,
+ .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx,
.is_valid_msr = intel_is_valid_msr,
.get_msr = intel_pmu_get_msr,
.set_msr = intel_pmu_set_msr,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 04a8212704c1..d175429c91b0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -106,8 +106,6 @@ module_param(enable_apicv, bool, S_IRUGO);
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);
-static u64 __read_mostly host_xss;
-
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
@@ -450,6 +448,7 @@ const u32 vmx_msr_index[] = {
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
};
#if IS_ENABLED(CONFIG_HYPERV)
@@ -638,6 +637,23 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
return NULL;
}
+static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
+{
+ int ret = 0;
+
+ u64 old_msr_data = msr->data;
+ msr->data = data;
+ if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+ preempt_disable();
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
+ preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
+ }
+ return ret;
+}
+
void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
vmcs_clear(loaded_vmcs->vmcs);
@@ -726,8 +742,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
- if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
- vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
+ kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
@@ -835,7 +851,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
vm_exit_controls_clearbit(vmx, exit);
}
-static int find_msr(struct vmx_msrs *m, unsigned int msr)
+int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
{
unsigned int i;
@@ -869,7 +885,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
}
break;
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_msr_index(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
@@ -877,7 +893,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
- i = find_msr(&m->host, msr);
+ i = vmx_find_msr_index(&m->host, msr);
if (i < 0)
return;
@@ -936,12 +952,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_msr_index(&m->guest, msr);
if (!entry_only)
- j = find_msr(&m->host, msr);
+ j = vmx_find_msr_index(&m->host, msr);
- if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
- (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
+ if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
@@ -1418,35 +1434,44 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long rflags, save_rflags;
- if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ save_rflags = vmx->rmode.save_rflags;
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
}
- to_vmx(vcpu)->rflags = rflags;
+ vmx->rflags = rflags;
}
- return to_vmx(vcpu)->rflags;
+ return vmx->rflags;
}
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- unsigned long old_rflags = vmx_get_rflags(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long old_rflags;
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->rflags = rflags;
- if (to_vmx(vcpu)->rmode.vm86_active) {
- to_vmx(vcpu)->rmode.save_rflags = rflags;
+ if (enable_unrestricted_guest) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ vmx->rflags = rflags;
+ vmcs_writel(GUEST_RFLAGS, rflags);
+ return;
+ }
+
+ old_rflags = vmx_get_rflags(vcpu);
+ vmx->rflags = rflags;
+ if (vmx->rmode.vm86_active) {
+ vmx->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
- if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
- to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
+ if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
+ vmx->emulation_required = emulation_required(vcpu);
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -1683,6 +1708,9 @@ static void setup_msrs(struct vcpu_vmx *vmx)
index = __find_msr_index(vmx, MSR_TSC_AUX);
if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
vmx->guest_msrs_ready = false;
@@ -1782,6 +1810,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ goto find_shared_msr;
case MSR_IA32_UMWAIT_CONTROL:
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
return 1;
@@ -1826,14 +1859,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported() ||
- (!msr_info->host_initiated &&
- !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
- return 1;
- msr_info->data = vcpu->arch.ia32_xss;
- break;
case MSR_IA32_RTIT_CTL:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
@@ -1884,8 +1909,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
- /* Else, falls through */
+ goto find_shared_msr;
default:
+ find_shared_msr:
msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
@@ -2001,6 +2027,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
+ return 1;
+ goto find_shared_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
@@ -2069,25 +2102,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported() ||
- (!msr_info->host_initiated &&
- !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
- return 1;
- /*
- * The only supported bit as of Skylake is bit 8, but
- * it is not supported on KVM.
- */
- if (data != 0)
- return 1;
- vcpu->arch.ia32_xss = data;
- if (vcpu->arch.ia32_xss != host_xss)
- add_atomic_switch_msr(vmx, MSR_IA32_XSS,
- vcpu->arch.ia32_xss, host_xss, false);
- else
- clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
- break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
vmx_rtit_ctl_check(vcpu, data) ||
@@ -2152,23 +2166,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
- /* Else, falls through */
+ goto find_shared_msr;
+
default:
+ find_shared_msr:
msr = find_msr_entry(vmx, msr_index);
- if (msr) {
- u64 old_msr_data = msr->data;
- msr->data = data;
- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
- preempt_disable();
- ret = kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
- preempt_enable();
- if (ret)
- msr->data = old_msr_data;
- }
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_info);
+ if (msr)
+ ret = vmx_set_guest_msr(vmx, msr, data);
+ else
+ ret = kvm_set_msr_common(vcpu, msr_info);
}
return ret;
@@ -2176,7 +2182,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, reg);
+
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
@@ -2188,7 +2195,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
+ case VCPU_EXREG_CR3:
+ if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ break;
default:
+ WARN_ON_ONCE(1);
break;
}
}
@@ -2859,13 +2871,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
-static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
-{
- if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-}
-
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -2878,8 +2883,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty))
+ if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
return;
if (is_pae_paging(vcpu)) {
@@ -2901,10 +2905,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
}
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
}
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -2913,8 +2914,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- vmx_decache_cr3(vcpu);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
@@ -2995,6 +2996,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct kvm *kvm = vcpu->kvm;
+ bool update_guest_cr3 = true;
unsigned long guest_cr3;
u64 eptp;
@@ -3011,15 +3013,20 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
}
- if (enable_unrestricted_guest || is_paging(vcpu) ||
- is_guest_mode(vcpu))
- guest_cr3 = kvm_read_cr3(vcpu);
- else
+ /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
+ if (is_guest_mode(vcpu))
+ update_guest_cr3 = false;
+ else if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
+ else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ guest_cr3 = vcpu->arch.cr3;
+ else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ update_guest_cr3 = false;
ept_load_pdptrs(vcpu);
}
- vmcs_writel(GUEST_CR3, guest_cr3);
+ if (update_guest_cr3)
+ vmcs_writel(GUEST_CR3, guest_cr3);
}
int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -3753,7 +3760,7 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
}
}
-static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
+static bool vmx_get_enable_apicv(struct kvm *kvm)
{
return enable_apicv;
}
@@ -4046,6 +4053,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
+ vcpu->arch.xsaves_enabled = xsaves_enabled;
+
if (!xsaves_enabled)
exec_control &= ~SECONDARY_EXEC_XSAVES;
@@ -4158,14 +4167,13 @@ static void ept_set_mmio_spte_mask(void)
#define VMX_XSS_EXIT_BITMAP 0
/*
- * Sets up the vmcs for emulated real mode.
+ * Noting that the initialization of Guest-state Area of VMCS is in
+ * vmx_vcpu_reset().
*/
-static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void init_vmcs(struct vcpu_vmx *vmx)
{
- int i;
-
if (nested)
- nested_vmx_vcpu_setup();
+ nested_vmx_set_vmcs_shadowing_bitmap();
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
@@ -4174,7 +4182,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- vmx->hv_deadline_tsc = -1;
exec_controls_set(vmx, vmx_exec_control(vmx));
@@ -4223,21 +4230,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
- vmx->guest_msrs[j].mask = -1ull;
- ++vmx->nmsrs;
- }
-
vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
@@ -4248,6 +4240,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
set_cr4_guest_host_mask(vmx);
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
if (vmx_xsaves_supported())
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
@@ -4350,9 +4345,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx->vpid != 0)
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-
cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
vmx->vcpu.arch.cr0 = cr0;
vmx_set_cr0(vcpu, cr0); /* enter rmode */
@@ -4707,7 +4699,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
return 0;
}
-static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.irq_exits;
return 1;
@@ -4979,21 +4971,6 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
vmcs_writel(GUEST_DR7, val);
}
-static int handle_cpuid(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_cpuid(vcpu);
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_rdmsr(vcpu);
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_wrmsr(vcpu);
-}
-
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
kvm_apic_update_ppr(vcpu);
@@ -5010,11 +4987,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_halt(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_halt(vcpu);
-}
-
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
return kvm_emulate_hypercall(vcpu);
@@ -5562,11 +5534,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
+ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
+ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_HLT] = kvm_emulate_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_RDPMC] = handle_rdpmc,
@@ -5939,9 +5911,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
+ && kvm_vmx_exit_handlers[exit_reason]) {
+#ifdef CONFIG_RETPOLINE
+ if (exit_reason == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
+#endif
return kvm_vmx_exit_handlers[exit_reason](vcpu);
- else {
+ } else {
vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
exit_reason);
dump_vmcs();
@@ -6027,17 +6013,17 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int tpr_threshold;
if (is_guest_mode(vcpu) &&
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
return;
- if (irr == -1 || tpr < irr) {
- vmcs_write32(TPR_THRESHOLD, 0);
- return;
- }
-
- vmcs_write32(TPR_THRESHOLD, irr);
+ tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
+ if (is_guest_mode(vcpu))
+ to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
+ else
+ vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -6514,9 +6500,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
cr3 = __get_current_cr3_fast();
@@ -6539,7 +6525,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- kvm_load_guest_xcr0(vcpu);
+ kvm_load_guest_xsave_state(vcpu);
if (static_cpu_has(X86_FEATURE_PKU) &&
kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
@@ -6646,7 +6632,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
__write_pkru(vmx->host_pkru);
}
- kvm_put_guest_xcr0(vcpu);
+ kvm_load_host_xsave_state(vcpu);
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
@@ -6700,7 +6686,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
int err;
struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
- int cpu;
+ int i, cpu;
BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
"struct kvm_vcpu must be at offset 0 for arch usercopy region");
@@ -6752,6 +6738,34 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->guest_msrs)
goto free_pml;
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
+ u32 index = vmx_msr_index[i];
+ u32 data_low, data_high;
+ int j = vmx->nmsrs;
+
+ if (rdmsr_safe(index, &data_low, &data_high) < 0)
+ continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
+
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
+ switch (index) {
+ case MSR_IA32_TSX_CTRL:
+ /*
+ * No need to pass TSX_CTRL_CPUID_CLEAR through, so
+ * let's avoid changing CPUID bits under the host
+ * kernel's feet.
+ */
+ vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ break;
+ default:
+ vmx->guest_msrs[j].mask = -1ull;
+ break;
+ }
+ ++vmx->nmsrs;
+ }
+
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
goto free_msrs;
@@ -6776,7 +6790,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
- vmx_vcpu_setup(vmx);
+ init_vmcs(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
@@ -6996,6 +7010,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
+ cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57));
#undef cr4_fixed1_update
}
@@ -7090,6 +7105,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
+ vcpu->arch.xsaves_enabled = false;
+
if (cpu_has_secondary_exec_ctrls()) {
vmx_compute_secondary_exec_control(vmx);
vmcs_set_secondary_exec_control(vmx);
@@ -7097,10 +7115,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (nested_vmx_allowed(vcpu))
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
+ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
@@ -7110,6 +7130,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ struct shared_msr_entry *msr;
+ msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
+ if (msr) {
+ bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ }
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7598,9 +7627,6 @@ static __init int hardware_setup(void)
WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
}
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
enable_vpid = 0;
@@ -7781,7 +7807,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
- .decache_cr3 = vmx_decache_cr3,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 5a0f34b1e226..7c1b978b2df4 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -22,11 +22,11 @@ extern u32 get_umwait_control_msr(void);
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
-#define NR_AUTOLOAD_MSRS 8
+#define NR_LOADSTORE_MSRS 8
struct vmx_msrs {
unsigned int nr;
- struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+ struct vmx_msr_entry val[NR_LOADSTORE_MSRS];
};
struct shared_msr_entry {
@@ -167,6 +167,9 @@ struct nested_vmx {
u64 vmcs01_debugctl;
u64 vmcs01_guest_bndcfgs;
+ /* to migrate it to L1 if L2 writes to L1's CR8 directly */
+ int l1_tpr_threshold;
+
u16 vpid02;
u16 last_vpid;
@@ -230,6 +233,10 @@ struct vcpu_vmx {
struct vmx_msrs host;
} msr_autoload;
+ struct msr_autostore {
+ struct vmx_msrs guest;
+ } msr_autostore;
+
struct {
int vm86_active;
ulong save_rflags;
@@ -334,6 +341,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+int vmx_find_msr_index(struct vmx_msrs *m, u32 msr);
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5d530521f11d..3ed167e039e5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -176,6 +176,8 @@ struct kvm_shared_msrs {
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;
+static u64 __read_mostly host_xss;
+
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
{ "pf_guest", VCPU_STAT(pf_guest) },
@@ -260,23 +262,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
}
}
-static void shared_msr_update(unsigned slot, u32 msr)
-{
- u64 value;
- unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
-
- /* only read, and nobody should modify it at this time,
- * so don't need lock */
- if (slot >= shared_msrs_global.nr) {
- printk(KERN_ERR "kvm: invalid MSR slot!");
- return;
- }
- rdmsrl_safe(msr, &value);
- smsr->values[slot].host = value;
- smsr->values[slot].curr = value;
-}
-
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
BUG_ON(slot >= KVM_NR_SHARED_MSRS);
@@ -288,10 +273,16 @@ EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
static void kvm_shared_msr_cpu_online(void)
{
- unsigned i;
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ u64 value;
+ int i;
- for (i = 0; i < shared_msrs_global.nr; ++i)
- shared_msr_update(i, shared_msrs_global.msrs[i]);
+ for (i = 0; i < shared_msrs_global.nr; ++i) {
+ rdmsrl_safe(shared_msrs_global.msrs[i], &value);
+ smsr->values[i].host = value;
+ smsr->values[i].curr = value;
+ }
}
int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
@@ -300,13 +291,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
int err;
- if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
return 0;
- smsr->values[slot].curr = value;
err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
if (err)
return 1;
+ smsr->values[slot].curr = value;
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -709,10 +701,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
ret = 1;
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+
out:
return ret;
@@ -722,7 +712,6 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- bool changed = true;
int offset;
gfn_t gfn;
int r;
@@ -730,8 +719,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (!is_pae_paging(vcpu))
return false;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
return true;
gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
@@ -739,11 +727,9 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu)
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
- goto out;
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-out:
+ return true;
- return changed;
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
}
EXPORT_SYMBOL_GPL(pdptrs_changed);
@@ -812,27 +798,34 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
+
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
- if (vcpu->guest_xcr0_loaded) {
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
+
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, host_xss);
}
+
}
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
@@ -984,7 +977,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
return 0;
}
@@ -1313,23 +1306,15 @@ static u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_MDS_NO;
/*
- * On TAA affected systems, export MDS_NO=0 when:
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
- * - Updated microcode is present. This is detected by
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
- * that VERW clears CPU buffers.
- *
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
- * mitigation and don't complain:
- *
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
- *
- * If TSX is disabled on the system, guests are also mitigated against
- * TAA and clear CPU buffer mitigation is not required for guests.
+ * On TAA affected systems:
+ * - nothing to do if TSX is disabled on the host.
+ * - we emulate TSX_CTRL if present on the host.
+ * This lets the guest use VERW to clear CPU buffers.
*/
- if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
- (data & ARCH_CAP_TSX_CTRL_MSR))
- data &= ~ARCH_CAP_MDS_NO;
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
return data;
}
@@ -1477,8 +1462,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
- bool host_initiated)
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
{
struct msr_data msr;
int ret;
@@ -1553,20 +1538,25 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
}
#ifdef CONFIG_X86_64
+struct pvclock_clock {
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+};
+
struct pvclock_gtod_data {
seqcount_t seq;
- struct { /* extract of a clocksource struct */
- int vclock_mode;
- u64 cycle_last;
- u64 mask;
- u32 mult;
- u32 shift;
- } clock;
+ struct pvclock_clock clock; /* extract of a clocksource struct */
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
+ u64 boot_ns_raw;
u64 boot_ns;
u64 nsec_base;
u64 wall_time_sec;
+ u64 monotonic_raw_nsec;
};
static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1574,9 +1564,10 @@ static struct pvclock_gtod_data pvclock_gtod_data;
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
- u64 boot_ns;
+ u64 boot_ns, boot_ns_raw;
boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
+ boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
@@ -1587,11 +1578,20 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata->clock.mult = tk->tkr_mono.mult;
vdata->clock.shift = tk->tkr_mono.shift;
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode;
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
+
vdata->boot_ns = boot_ns;
vdata->nsec_base = tk->tkr_mono.xtime_nsec;
vdata->wall_time_sec = tk->xtime_sec;
+ vdata->boot_ns_raw = boot_ns_raw;
+ vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec;
+
write_seqcount_end(&vdata->seq);
}
#endif
@@ -2015,21 +2015,21 @@ static u64 read_tsc(void)
return last;
}
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+ int *mode)
{
long v;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
u64 tsc_pg_val;
- switch (gtod->clock.vclock_mode) {
+ switch (clock->vclock_mode) {
case VCLOCK_HVCLOCK:
tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
tsc_timestamp);
if (tsc_pg_val != U64_MAX) {
/* TSC page valid */
*mode = VCLOCK_HVCLOCK;
- v = (tsc_pg_val - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ v = (tsc_pg_val - clock->cycle_last) &
+ clock->mask;
} else {
/* TSC page invalid */
*mode = VCLOCK_NONE;
@@ -2038,8 +2038,8 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
case VCLOCK_TSC:
*mode = VCLOCK_TSC;
*tsc_timestamp = read_tsc();
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ v = (*tsc_timestamp - clock->cycle_last) &
+ clock->mask;
break;
default:
*mode = VCLOCK_NONE;
@@ -2048,10 +2048,10 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
if (*mode == VCLOCK_NONE)
*tsc_timestamp = v = 0;
- return v * gtod->clock.mult;
+ return v * clock->mult;
}
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -2060,10 +2060,10 @@ static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
do {
seq = read_seqcount_begin(&gtod->seq);
- ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
+ ns = gtod->monotonic_raw_nsec;
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
- ns += gtod->boot_ns;
+ ns += gtod->boot_ns_raw;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
*t = ns;
@@ -2081,7 +2081,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
seq = read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -2098,7 +2098,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
tsc_timestamp));
}
@@ -2721,6 +2721,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ /*
+ * We do support PT if kvm_x86_ops->pt_supported(), but we do
+ * not support IA32_XSS[bit 8]. Guests will have to use
+ * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT
+ * MSRs.
+ */
+ if (data != 0)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
return 1;
@@ -3048,6 +3062,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -3825,12 +3845,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (lapic_in_kernel(vcpu)) {
- if (events->smi.latched_init)
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- else
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- }
+ }
+
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
}
@@ -4421,6 +4442,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_NESTED_STATE: {
struct kvm_nested_state __user *user_kvm_nested_state = argp;
struct kvm_nested_state kvm_state;
+ int idx;
r = -EINVAL;
if (!kvm_x86_ops->set_nested_state)
@@ -4444,7 +4466,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
&& !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
break;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_GET_SUPPORTED_HV_CPUID: {
@@ -4946,9 +4970,6 @@ set_identity_unlock:
if (!irqchip_kernel(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
- if (r)
- goto set_irqchip_out;
- r = 0;
set_irqchip_out:
kfree(chip);
break;
@@ -6136,7 +6157,7 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -7866,6 +7887,19 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap)
+{
+ cpumask_var_t cpus;
+
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
+ vcpu_bitmap, cpus);
+
+ free_cpumask_var(cpus);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -7943,7 +7977,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
*/
put_page(page);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
{
@@ -8702,8 +8735,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
goto out;
- /* INITs are latched while in SMM */
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ /*
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
+ * INIT state; latched init should be reported using
+ * KVM_SET_VCPU_EVENTS, so reject it here.
+ */
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
goto out;
@@ -8795,7 +8832,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -9322,6 +9359,9 @@ int kvm_arch_hardware_setup(void)
kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
}
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
kvm_init_msr_list();
return 0;
}
@@ -9375,7 +9415,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_free_pio_data;
if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm);
r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
@@ -9444,7 +9484,13 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
vcpu->arch.l1tf_flush_l1d = true;
+ if (pmu->version && unlikely(pmu->event_count)) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
kvm_x86_ops->sched_in(vcpu, cpu);
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index dbf7442a822b..29391af8871d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -238,8 +238,7 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
return false;
}
-static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
+static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg)
{
unsigned long val = kvm_register_read(vcpu, reg);
@@ -247,8 +246,7 @@ static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
}
static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
- enum kvm_reg reg,
- unsigned long val)
+ int reg, unsigned long val)
{
if (!is_64_bit_mode(vcpu))
val = (u32)val;
@@ -260,6 +258,11 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
return !(kvm->arch.disabled_quirks & quirk);
}
+static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
+{
+ return is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu);
+}
+
void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
@@ -366,7 +369,7 @@ static inline bool kvm_pat_valid(u64 data)
return (data | ((data & 0x0202020202020202ull) << 1)) == data;
}
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
#endif
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index 6b17d179ef8a..39fdd0641637 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -21,6 +21,8 @@
#include <linux/ccp.h>
#include <linux/firmware.h>
+#include <asm/smp.h>
+
#include "sp-dev.h"
#include "psp-dev.h"
@@ -235,6 +237,13 @@ static int __sev_platform_init_locked(int *error)
return rc;
psp->sev_state = SEV_STATE_INIT;
+
+ /* Prepare for first SEV guest launch after INIT */
+ wbinvd_on_all_cpus();
+ rc = __sev_do_cmd_locked(SEV_CMD_DF_FLUSH, NULL, error);
+ if (rc)
+ return rc;
+
dev_dbg(psp->dev, "SEV firmware initialized\n");
return rc;
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 563e87ed0766..45969927cc81 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -141,12 +141,17 @@ static int its_send_vpe_cmd(struct its_vpe *vpe, struct its_cmd_info *info)
int its_schedule_vpe(struct its_vpe *vpe, bool on)
{
struct its_cmd_info info;
+ int ret;
WARN_ON(preemptible());
info.cmd_type = on ? SCHEDULE_VPE : DESCHEDULE_VPE;
- return its_send_vpe_cmd(vpe, &info);
+ ret = its_send_vpe_cmd(vpe, &info);
+ if (!ret)
+ vpe->resident = on;
+
+ return ret;
}
int its_invall_vpe(struct its_vpe *vpe)
diff --git a/include/Kbuild b/include/Kbuild
index 95508049ee51..25edc43483e0 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -66,6 +66,8 @@ header-test- += keys/asymmetric-type.h
header-test- += keys/big_key-type.h
header-test- += keys/request_key_auth-type.h
header-test- += kvm/arm_arch_timer.h
+header-test-$(CONFIG_ARM) += kvm/arm_hypercalls.h
+header-test-$(CONFIG_ARM64) += kvm/arm_hypercalls.h
header-test- += kvm/arm_pmu.h
header-test-$(CONFIG_ARM) += kvm/arm_psci.h
header-test-$(CONFIG_ARM64) += kvm/arm_psci.h
diff --git a/include/kvm/arm_hypercalls.h b/include/kvm/arm_hypercalls.h
new file mode 100644
index 000000000000..0e2509d27910
--- /dev/null
+++ b/include/kvm/arm_hypercalls.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019 Arm Ltd. */
+
+#ifndef __KVM_ARM_HYPERCALLS_H
+#define __KVM_ARM_HYPERCALLS_H
+
+#include <asm/kvm_emulate.h>
+
+int kvm_hvc_call_handler(struct kvm_vcpu *vcpu);
+
+static inline u32 smccc_get_function(struct kvm_vcpu *vcpu)
+{
+ return vcpu_get_reg(vcpu, 0);
+}
+
+static inline unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu)
+{
+ return vcpu_get_reg(vcpu, 1);
+}
+
+static inline unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu)
+{
+ return vcpu_get_reg(vcpu, 2);
+}
+
+static inline unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu)
+{
+ return vcpu_get_reg(vcpu, 3);
+}
+
+static inline void smccc_set_retval(struct kvm_vcpu *vcpu,
+ unsigned long a0,
+ unsigned long a1,
+ unsigned long a2,
+ unsigned long a3)
+{
+ vcpu_set_reg(vcpu, 0, a0);
+ vcpu_set_reg(vcpu, 1, a1);
+ vcpu_set_reg(vcpu, 2, a2);
+ vcpu_set_reg(vcpu, 3, a3);
+}
+
+#endif
diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h
index 632e78bdef4d..5b58bd2fe088 100644
--- a/include/kvm/arm_psci.h
+++ b/include/kvm/arm_psci.h
@@ -40,7 +40,7 @@ static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm)
}
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu);
+int kvm_psci_call(struct kvm_vcpu *vcpu);
struct kvm_one_reg;
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index af4f09c02bf1..9d53f545a3d5 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -240,7 +240,7 @@ struct vgic_dist {
* Contains the attributes and gpa of the LPI configuration table.
* Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
* one address across all redistributors.
- * GICv3 spec: 6.1.2 "LPI Configuration tables"
+ * GICv3 spec: IHI 0069E 6.1.1 "LPI Configuration tables"
*/
u64 propbaser;
@@ -378,8 +378,6 @@ static inline int kvm_vgic_get_max_vcpus(void)
return kvm_vgic_global_state.max_gic_vcpus;
}
-int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
-
/**
* kvm_vgic_setup_default_irq_routing:
* Setup a default flat gsi routing table mapping all SPIs
@@ -396,7 +394,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq,
struct kvm_kernel_irq_routing_entry *irq_entry);
-void kvm_vgic_v4_enable_doorbell(struct kvm_vcpu *vcpu);
-void kvm_vgic_v4_disable_doorbell(struct kvm_vcpu *vcpu);
+int vgic_v4_load(struct kvm_vcpu *vcpu);
+int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db);
#endif /* __KVM_ARM_VGIC_H */
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index df01a8579034..59494df0f55b 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -45,6 +45,7 @@
#define ARM_SMCCC_OWNER_SIP 2
#define ARM_SMCCC_OWNER_OEM 3
#define ARM_SMCCC_OWNER_STANDARD 4
+#define ARM_SMCCC_OWNER_STANDARD_HYP 5
#define ARM_SMCCC_OWNER_TRUSTED_APP 48
#define ARM_SMCCC_OWNER_TRUSTED_APP_END 49
#define ARM_SMCCC_OWNER_TRUSTED_OS 50
@@ -318,5 +319,63 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
#define SMCCC_RET_NOT_SUPPORTED -1
#define SMCCC_RET_NOT_REQUIRED -2
+/*
+ * Like arm_smccc_1_1* but always returns SMCCC_RET_NOT_SUPPORTED.
+ * Used when the SMCCC conduit is not defined. The empty asm statement
+ * avoids compiler warnings about unused variables.
+ */
+#define __fail_smccc_1_1(...) \
+ do { \
+ __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \
+ asm ("" __constraints(__count_args(__VA_ARGS__))); \
+ if (___res) \
+ ___res->a0 = SMCCC_RET_NOT_SUPPORTED; \
+ } while (0)
+
+/*
+ * arm_smccc_1_1_invoke() - make an SMCCC v1.1 compliant call
+ *
+ * This is a variadic macro taking one to eight source arguments, and
+ * an optional return structure.
+ *
+ * @a0-a7: arguments passed in registers 0 to 7
+ * @res: result values from registers 0 to 3
+ *
+ * This macro will make either an HVC call or an SMC call depending on the
+ * current SMCCC conduit. If no valid conduit is available then -1
+ * (SMCCC_RET_NOT_SUPPORTED) is returned in @res.a0 (if supplied).
+ *
+ * The return value also provides the conduit that was used.
+ */
+#define arm_smccc_1_1_invoke(...) ({ \
+ int method = arm_smccc_1_1_get_conduit(); \
+ switch (method) { \
+ case SMCCC_CONDUIT_HVC: \
+ arm_smccc_1_1_hvc(__VA_ARGS__); \
+ break; \
+ case SMCCC_CONDUIT_SMC: \
+ arm_smccc_1_1_smc(__VA_ARGS__); \
+ break; \
+ default: \
+ __fail_smccc_1_1(__VA_ARGS__); \
+ method = SMCCC_CONDUIT_NONE; \
+ break; \
+ } \
+ method; \
+ })
+
+/* Paravirtualised time calls (defined by ARM DEN0057A) */
+#define ARM_SMCCC_HV_PV_TIME_FEATURES \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ ARM_SMCCC_OWNER_STANDARD_HYP, \
+ 0x20)
+
+#define ARM_SMCCC_HV_PV_TIME_ST \
+ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ ARM_SMCCC_OWNER_STANDARD_HYP, \
+ 0x21)
+
#endif /*__ASSEMBLY__*/
#endif /*__LINUX_ARM_SMCCC_H*/
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068793a619ca..89d75edb5750 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -136,6 +136,7 @@ enum cpuhp_state {
/* Must be the last timer callback */
CPUHP_AP_DUMMY_TIMER_STARTING,
CPUHP_AP_ARM_XEN_STARTING,
+ CPUHP_AP_ARM_KVMPV_STARTING,
CPUHP_AP_ARM_CORESIGHT_STARTING,
CPUHP_AP_ARM64_ISNDEP_STARTING,
CPUHP_AP_SMPCFD_DYING,
diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h
index e6b155713b47..5dbcfc65f21e 100644
--- a/include/linux/irqchip/arm-gic-v4.h
+++ b/include/linux/irqchip/arm-gic-v4.h
@@ -32,9 +32,13 @@ struct its_vm {
struct its_vpe {
struct page *vpt_page;
struct its_vm *its_vm;
+ /* per-vPE VLPI tracking */
+ atomic_t vlpi_count;
/* Doorbell interrupt */
int irq;
irq_hw_number_t vpe_db_lpi;
+ /* VPE resident */
+ bool resident;
/* VPE proxy mapping */
int vpe_proxy_event;
/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d41c521a39da..7ed1e2f8641e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -266,7 +266,8 @@ struct kvm_vcpu {
struct preempt_notifier preempt_notifier;
#endif
int cpu;
- int vcpu_id;
+ int vcpu_id; /* id given by userspace at creation */
+ int vcpu_idx; /* index in kvm->vcpus array */
int srcu_idx;
int mode;
u64 requests;
@@ -278,7 +279,6 @@ struct kvm_vcpu {
struct mutex mutex;
struct kvm_run *run;
- int guest_xcr0_loaded;
struct swait_queue_head wq;
struct pid __rcu *pid;
int sigset_active;
@@ -571,13 +571,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *tmp;
- int idx;
-
- kvm_for_each_vcpu(idx, tmp, vcpu->kvm)
- if (tmp == vcpu)
- return idx;
- BUG();
+ return vcpu->vcpu_idx;
}
#define kvm_for_each_memslot(memslot, slots) \
@@ -622,6 +616,7 @@ void kvm_exit(void);
void kvm_get_kvm(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
+void kvm_put_kvm_no_destroy(struct kvm *kvm);
static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
{
@@ -746,6 +741,28 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
unsigned long len);
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
gpa_t gpa, unsigned long len);
+
+#define __kvm_put_guest(kvm, gfn, offset, value, type) \
+({ \
+ unsigned long __addr = gfn_to_hva(kvm, gfn); \
+ type __user *__uaddr = (type __user *)(__addr + offset); \
+ int __ret = -EFAULT; \
+ \
+ if (!kvm_is_error_hva(__addr)) \
+ __ret = put_user(value, __uaddr); \
+ if (!__ret) \
+ mark_page_dirty(kvm, gfn); \
+ __ret; \
+})
+
+#define kvm_put_guest(kvm, gpa, value, type) \
+({ \
+ gpa_t __gpa = gpa; \
+ struct kvm *__kvm = kvm; \
+ __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT, \
+ offset_in_page(__gpa), (value), type); \
+})
+
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
@@ -791,6 +808,8 @@ void kvm_reload_remote_mmus(struct kvm *kvm);
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
unsigned long *vcpu_bitmap, cpumask_var_t tmp);
bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
+bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req,
+ unsigned long *vcpu_bitmap);
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
@@ -1241,7 +1260,7 @@ extern unsigned int halt_poll_ns_grow_start;
extern unsigned int halt_poll_ns_shrink;
struct kvm_device {
- struct kvm_device_ops *ops;
+ const struct kvm_device_ops *ops;
struct kvm *kvm;
void *private;
struct list_head vm_node;
@@ -1294,7 +1313,7 @@ struct kvm_device_ops {
void kvm_device_get(struct kvm_device *dev);
void kvm_device_put(struct kvm_device *dev);
struct kvm_device *kvm_device_from_filp(struct file *filp);
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type);
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type);
void kvm_unregister_device_ops(u32 type);
extern struct kvm_device_ops kvm_mpic_ops;
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index bde5374ae021..1c88e69db3d9 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -35,6 +35,8 @@ typedef unsigned long gva_t;
typedef u64 gpa_t;
typedef u64 gfn_t;
+#define GPA_INVALID (~(gpa_t)0)
+
typedef unsigned long hva_t;
typedef u64 hpa_t;
typedef u64 hfn_t;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 68ccc5b1913b..a07bfdb7d8ea 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1336,6 +1336,8 @@ extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
+extern int perf_event_period(struct perf_event *event, u64 value);
+extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1415,6 +1417,14 @@ static inline void perf_event_disable(struct perf_event *event) { }
static inline int __perf_event_disable(void *info) { return -1; }
static inline void perf_event_task_tick(void) { }
static inline int perf_event_release_kernel(struct perf_event *event) { return 0; }
+static inline int perf_event_period(struct perf_event *event, u64 value)
+{
+ return -EINVAL;
+}
+static inline u64 perf_event_pause(struct perf_event *event, bool reset)
+{
+ return 0;
+}
#endif
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52641d8ca9e8..e6f17c8e2dba 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -235,6 +235,7 @@ struct kvm_hyperv_exit {
#define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
#define KVM_EXIT_HYPERV 27
+#define KVM_EXIT_ARM_NISV 28
/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -394,6 +395,11 @@ struct kvm_run {
} eoi;
/* KVM_EXIT_HYPERV */
struct kvm_hyperv_exit hyperv;
+ /* KVM_EXIT_ARM_NISV */
+ struct {
+ __u64 esr_iss;
+ __u64 fault_ipa;
+ } arm_nisv;
/* Fix the size of the union. */
char padding[256];
};
@@ -1000,6 +1006,9 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_PMU_EVENT_FILTER 173
#define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174
#define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175
+#define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176
+#define KVM_CAP_ARM_NISV_TO_USER 177
+#define KVM_CAP_ARM_INJECT_EXT_DABT 178
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1227,6 +1236,8 @@ enum kvm_device_type {
#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS
KVM_DEV_TYPE_XIVE,
#define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE
+ KVM_DEV_TYPE_ARM_PV_TIME,
+#define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME
KVM_DEV_TYPE_MAX,
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 00a014670ed0..2cba42957b35 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5029,6 +5029,24 @@ static void _perf_event_reset(struct perf_event *event)
perf_event_update_userpage(event);
}
+/* Assume it's not an event with inherit set. */
+u64 perf_event_pause(struct perf_event *event, bool reset)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(event->attr.inherit);
+ _perf_event_disable(event);
+ count = local64_read(&event->count);
+ if (reset)
+ local64_set(&event->count, 0);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
+EXPORT_SYMBOL_GPL(perf_event_pause);
+
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
@@ -5106,16 +5124,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value)
return event->pmu->check_period(event, value);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
+static int _perf_event_period(struct perf_event *event, u64 value)
{
- u64 value;
-
if (!is_sampling_event(event))
return -EINVAL;
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
-
if (!value)
return -EINVAL;
@@ -5133,6 +5146,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
return 0;
}
+int perf_event_period(struct perf_event *event, u64 value)
+{
+ struct perf_event_context *ctx;
+ int ret;
+
+ ctx = perf_event_ctx_lock(event);
+ ret = _perf_event_period(event, value);
+ perf_event_ctx_unlock(event, ctx);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(perf_event_period);
+
static const struct file_operations perf_fops;
static inline int perf_fget_light(int fd, struct fd *p)
@@ -5176,8 +5202,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
- return perf_event_period(event, (u64 __user *)arg);
+ {
+ u64 value;
+ if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
+ return -EFAULT;
+
+ return _perf_event_period(event, value);
+ }
case PERF_EVENT_IOC_ID:
{
u64 id = primary_event_id(event);
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 409c1fa75e03..30072c3f52fb 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -13,6 +13,7 @@
/x86_64/vmx_dirty_log_test
/x86_64/vmx_set_nested_state_test
/x86_64/vmx_tsc_adjust_test
+/x86_64/xss_msr_test
/clear_dirty_log_test
/dirty_log_test
/kvm_create_max_vcpus
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index c5ec868fa1e5..3138a916574a 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -25,6 +25,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
TEST_GEN_PROGS_x86_64 += clear_dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index ff234018219c..635ee6c33ad2 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -308,6 +308,8 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_x86_state *state);
+struct kvm_msr_list *kvm_get_msr_index_list(void);
+
struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_cpuid2 *cpuid);
@@ -322,10 +324,13 @@ kvm_get_supported_cpuid_entry(uint32_t function)
}
uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
+int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value);
void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
uint64_t msr_value);
-uint32_t kvm_get_cpuid_max(void);
+uint32_t kvm_get_cpuid_max_basic(void);
+uint32_t kvm_get_cpuid_max_extended(void);
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
/*
diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
index 231d79e57774..6f38c3dc0d56 100644
--- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
+++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
@@ -29,12 +29,9 @@ void test_vcpu_creation(int first_vcpu_id, int num_vcpus)
vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
- for (i = 0; i < num_vcpus; i++) {
- int vcpu_id = first_vcpu_id + i;
-
+ for (i = first_vcpu_id; i < first_vcpu_id + num_vcpus; i++)
/* This asserts that the vCPU was created. */
- vm_vcpu_add(vm, vcpu_id);
- }
+ vm_vcpu_add(vm, i);
kvm_vm_free(vm);
}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 6698cb741e10..683d3bdb8f6a 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -869,7 +869,7 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
return buffer.entry.data;
}
-/* VCPU Set MSR
+/* _VCPU Set MSR
*
* Input Args:
* vm - Virtual Machine
@@ -879,12 +879,12 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
*
* Output Args: None
*
- * Return: On success, nothing. On failure a TEST_ASSERT is produced.
+ * Return: The result of KVM_SET_MSRS.
*
- * Set value of MSR for VCPU.
+ * Sets the value of an MSR for the given VCPU.
*/
-void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
- uint64_t msr_value)
+int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
struct {
@@ -899,6 +899,29 @@ void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
buffer.entry.index = msr_index;
buffer.entry.data = msr_value;
r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header);
+ return r;
+}
+
+/* VCPU Set MSR
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * msr_index - Index of MSR
+ * msr_value - New value of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, nothing. On failure a TEST_ASSERT is produced.
+ *
+ * Set value of MSR for VCPU.
+ */
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value)
+{
+ int r;
+
+ r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value);
TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
" rc: %i errno: %i", r, errno);
}
@@ -1000,19 +1023,45 @@ struct kvm_x86_state {
struct kvm_msrs msrs;
};
-static int kvm_get_num_msrs(struct kvm_vm *vm)
+static int kvm_get_num_msrs_fd(int kvm_fd)
{
struct kvm_msr_list nmsrs;
int r;
nmsrs.nmsrs = 0;
- r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
+ r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i",
r);
return nmsrs.nmsrs;
}
+static int kvm_get_num_msrs(struct kvm_vm *vm)
+{
+ return kvm_get_num_msrs_fd(vm->kvm_fd);
+}
+
+struct kvm_msr_list *kvm_get_msr_index_list(void)
+{
+ struct kvm_msr_list *list;
+ int nmsrs, r, kvm_fd;
+
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ nmsrs = kvm_get_num_msrs_fd(kvm_fd);
+ list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+ list->nmsrs = nmsrs;
+ r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+ close(kvm_fd);
+
+ TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+ r);
+
+ return list;
+}
+
struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
{
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
@@ -1158,7 +1207,12 @@ bool is_intel_cpu(void)
return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
}
-uint32_t kvm_get_cpuid_max(void)
+uint32_t kvm_get_cpuid_max_basic(void)
+{
+ return kvm_get_supported_cpuid_entry(0)->eax;
+}
+
+uint32_t kvm_get_cpuid_max_extended(void)
{
return kvm_get_supported_cpuid_entry(0x80000000)->eax;
}
@@ -1169,7 +1223,7 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
bool pae;
/* SDM 4.1.4 */
- if (kvm_get_cpuid_max() < 0x80000008) {
+ if (kvm_get_cpuid_max_extended() < 0x80000008) {
pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
*pa_bits = pae ? 36 : 32;
*va_bits = 32;
diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
index d5290b4ad636..b705637ca14b 100644
--- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c
+++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
@@ -25,12 +25,15 @@
static void guest_code(void)
{
- register u64 stage asm("11") = 0;
-
- for (;;) {
- GUEST_SYNC(0);
- asm volatile ("ahi %0,1" : : "r"(stage));
- }
+ /*
+ * We embed diag 501 here instead of doing a ucall to avoid that
+ * the compiler has messed with r11 at the time of the ucall.
+ */
+ asm volatile (
+ "0: diag 0,0,0x501\n"
+ " ahi 11,1\n"
+ " j 0b\n"
+ );
}
#define REG_COMPARE(reg) \
diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
new file mode 100644
index 000000000000..851ea81b9d9f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019, Google LLC.
+ *
+ * Tests for the IA32_XSS MSR.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+#define VCPU_ID 1
+#define MSR_BITS 64
+
+#define X86_FEATURE_XSAVES (1<<3)
+
+bool is_supported_msr(u32 msr_index)
+{
+ struct kvm_msr_list *list;
+ bool found = false;
+ int i;
+
+ list = kvm_get_msr_index_list();
+ for (i = 0; i < list->nmsrs; ++i) {
+ if (list->indices[i] == msr_index) {
+ found = true;
+ break;
+ }
+ }
+
+ free(list);
+ return found;
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_cpuid_entry2 *entry;
+ bool xss_supported = false;
+ struct kvm_vm *vm;
+ uint64_t xss_val;
+ int i, r;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, 0);
+
+ if (kvm_get_cpuid_max_basic() >= 0xd) {
+ entry = kvm_get_supported_cpuid_index(0xd, 1);
+ xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES);
+ }
+ if (!xss_supported) {
+ printf("IA32_XSS is not supported by the vCPU.\n");
+ exit(KSFT_SKIP);
+ }
+
+ xss_val = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_XSS);
+ TEST_ASSERT(xss_val == 0,
+ "MSR_IA32_XSS should be initialized to zero\n");
+
+ vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, xss_val);
+ /*
+ * At present, KVM only supports a guest IA32_XSS value of 0. Verify
+ * that trying to set the guest IA32_XSS to an unsupported value fails.
+ * Also, in the future when a non-zero value succeeds check that
+ * IA32_XSS is in the KVM_GET_MSR_INDEX_LIST.
+ */
+ for (i = 0; i < MSR_BITS; ++i) {
+ r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i);
+ TEST_ASSERT(r == 0 || is_supported_msr(MSR_IA32_XSS),
+ "IA32_XSS was able to be set, but was not found in KVM_GET_MSR_INDEX_LIST.\n");
+ }
+
+ kvm_vm_free(vm);
+}
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index e2bb5bd60227..f182b2380345 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -80,7 +80,7 @@ static inline bool userspace_irqchip(struct kvm *kvm)
static void soft_timer_start(struct hrtimer *hrt, u64 ns)
{
hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_HARD);
}
static void soft_timer_cancel(struct hrtimer *hrt)
@@ -697,11 +697,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
ptimer->cntvoff = 0;
- hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
timer->bg_timer.function = kvm_bg_timer_expire;
- hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
vtimer->hrtimer.function = kvm_hrtimer_expire;
ptimer->hrtimer.function = kvm_hrtimer_expire;
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 86c6aa1cb58e..12e0280291ce 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -40,6 +40,10 @@
#include <asm/kvm_coproc.h>
#include <asm/sections.h>
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_pmu.h>
+#include <kvm/arm_psci.h>
+
#ifdef REQUIRES_VIRT
__asm__(".arch_extension virt");
#endif
@@ -98,6 +102,26 @@ int kvm_arch_check_processor_compat(void)
return 0;
}
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_ARM_NISV_TO_USER:
+ r = 0;
+ kvm->arch.return_nisv_io_abort_to_user = true;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
/**
* kvm_arch_init_vm - initializes a VM data structure
@@ -197,6 +221,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_VCPU_EVENTS:
case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
+ case KVM_CAP_ARM_NISV_TO_USER:
+ case KVM_CAP_ARM_INJECT_EXT_DABT:
r = 1;
break;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
@@ -322,20 +348,24 @@ void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
/*
* If we're about to block (most likely because we've just hit a
* WFI), we need to sync back the state of the GIC CPU interface
- * so that we have the lastest PMR and group enables. This ensures
+ * so that we have the latest PMR and group enables. This ensures
* that kvm_arch_vcpu_runnable has up-to-date data to decide
* whether we have pending interrupts.
+ *
+ * For the same reason, we want to tell GICv4 that we need
+ * doorbells to be signalled, should an interrupt become pending.
*/
preempt_disable();
kvm_vgic_vmcr_sync(vcpu);
+ vgic_v4_put(vcpu, true);
preempt_enable();
-
- kvm_vgic_v4_enable_doorbell(vcpu);
}
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- kvm_vgic_v4_disable_doorbell(vcpu);
+ preempt_disable();
+ vgic_v4_load(vcpu);
+ preempt_enable();
}
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
@@ -351,6 +381,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_arm_reset_debug_ptr(vcpu);
+ kvm_arm_pvtime_vcpu_init(&vcpu->arch);
+
return kvm_vgic_vcpu_init(vcpu);
}
@@ -380,11 +412,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_vcpu_load_sysregs(vcpu);
kvm_arch_vcpu_load_fp(vcpu);
kvm_vcpu_pmu_restore_guest(vcpu);
+ if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
+ kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
if (single_task_running())
- vcpu_clear_wfe_traps(vcpu);
+ vcpu_clear_wfx_traps(vcpu);
else
- vcpu_set_wfe_traps(vcpu);
+ vcpu_set_wfx_traps(vcpu);
vcpu_ptrauth_setup_lazy(vcpu);
}
@@ -645,6 +679,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
* that a VCPU sees new virtual interrupts.
*/
kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+
+ if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
+ kvm_update_stolen_time(vcpu);
}
}
diff --git a/virt/kvm/arm/hypercalls.c b/virt/kvm/arm/hypercalls.c
new file mode 100644
index 000000000000..550dfa3e53cd
--- /dev/null
+++ b/virt/kvm/arm/hypercalls.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_psci.h>
+
+int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+{
+ u32 func_id = smccc_get_function(vcpu);
+ long val = SMCCC_RET_NOT_SUPPORTED;
+ u32 feature;
+ gpa_t gpa;
+
+ switch (func_id) {
+ case ARM_SMCCC_VERSION_FUNC_ID:
+ val = ARM_SMCCC_VERSION_1_1;
+ break;
+ case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+ feature = smccc_get_arg1(vcpu);
+ switch (feature) {
+ case ARM_SMCCC_ARCH_WORKAROUND_1:
+ switch (kvm_arm_harden_branch_predictor()) {
+ case KVM_BP_HARDEN_UNKNOWN:
+ break;
+ case KVM_BP_HARDEN_WA_NEEDED:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ case KVM_BP_HARDEN_NOT_REQUIRED:
+ val = SMCCC_RET_NOT_REQUIRED;
+ break;
+ }
+ break;
+ case ARM_SMCCC_ARCH_WORKAROUND_2:
+ switch (kvm_arm_have_ssbd()) {
+ case KVM_SSBD_FORCE_DISABLE:
+ case KVM_SSBD_UNKNOWN:
+ break;
+ case KVM_SSBD_KERNEL:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ case KVM_SSBD_FORCE_ENABLE:
+ case KVM_SSBD_MITIGATED:
+ val = SMCCC_RET_NOT_REQUIRED;
+ break;
+ }
+ break;
+ case ARM_SMCCC_HV_PV_TIME_FEATURES:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ }
+ break;
+ case ARM_SMCCC_HV_PV_TIME_FEATURES:
+ val = kvm_hypercall_pv_features(vcpu);
+ break;
+ case ARM_SMCCC_HV_PV_TIME_ST:
+ gpa = kvm_init_stolen_time(vcpu);
+ if (gpa != GPA_INVALID)
+ val = gpa;
+ break;
+ default:
+ return kvm_psci_call(vcpu);
+ }
+
+ smccc_set_retval(vcpu, val, 0, 0, 0);
+ return 1;
+}
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index 6af5c91337f2..70d3b449692c 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -167,7 +167,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
if (ret)
return ret;
} else {
- kvm_err("load/store instruction decoding not implemented\n");
+ if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
+ run->exit_reason = KVM_EXIT_ARM_NISV;
+ run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
+ run->arm_nisv.fault_ipa = fault_ipa;
+ return 0;
+ }
+
+ kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
return -ENOSYS;
}
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index 87927f7e1ee7..17e2bdd4b76f 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -15,6 +15,7 @@
#include <asm/kvm_host.h>
#include <kvm/arm_psci.h>
+#include <kvm/arm_hypercalls.h>
/*
* This is an implementation of the Power State Coordination Interface
@@ -23,38 +24,6 @@
#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
-static u32 smccc_get_function(struct kvm_vcpu *vcpu)
-{
- return vcpu_get_reg(vcpu, 0);
-}
-
-static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu)
-{
- return vcpu_get_reg(vcpu, 1);
-}
-
-static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu)
-{
- return vcpu_get_reg(vcpu, 2);
-}
-
-static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu)
-{
- return vcpu_get_reg(vcpu, 3);
-}
-
-static void smccc_set_retval(struct kvm_vcpu *vcpu,
- unsigned long a0,
- unsigned long a1,
- unsigned long a2,
- unsigned long a3)
-{
- vcpu_set_reg(vcpu, 0, a0);
- vcpu_set_reg(vcpu, 1, a1);
- vcpu_set_reg(vcpu, 2, a2);
- vcpu_set_reg(vcpu, 3, a3);
-}
-
static unsigned long psci_affinity_mask(unsigned long affinity_level)
{
if (affinity_level <= 3)
@@ -373,7 +342,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
* Errors:
* -EINVAL: Unrecognized PSCI function
*/
-static int kvm_psci_call(struct kvm_vcpu *vcpu)
+int kvm_psci_call(struct kvm_vcpu *vcpu)
{
switch (kvm_psci_version(vcpu, vcpu->kvm)) {
case KVM_ARM_PSCI_1_0:
@@ -387,55 +356,6 @@ static int kvm_psci_call(struct kvm_vcpu *vcpu)
};
}
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
-{
- u32 func_id = smccc_get_function(vcpu);
- u32 val = SMCCC_RET_NOT_SUPPORTED;
- u32 feature;
-
- switch (func_id) {
- case ARM_SMCCC_VERSION_FUNC_ID:
- val = ARM_SMCCC_VERSION_1_1;
- break;
- case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
- feature = smccc_get_arg1(vcpu);
- switch(feature) {
- case ARM_SMCCC_ARCH_WORKAROUND_1:
- switch (kvm_arm_harden_branch_predictor()) {
- case KVM_BP_HARDEN_UNKNOWN:
- break;
- case KVM_BP_HARDEN_WA_NEEDED:
- val = SMCCC_RET_SUCCESS;
- break;
- case KVM_BP_HARDEN_NOT_REQUIRED:
- val = SMCCC_RET_NOT_REQUIRED;
- break;
- }
- break;
- case ARM_SMCCC_ARCH_WORKAROUND_2:
- switch (kvm_arm_have_ssbd()) {
- case KVM_SSBD_FORCE_DISABLE:
- case KVM_SSBD_UNKNOWN:
- break;
- case KVM_SSBD_KERNEL:
- val = SMCCC_RET_SUCCESS;
- break;
- case KVM_SSBD_FORCE_ENABLE:
- case KVM_SSBD_MITIGATED:
- val = SMCCC_RET_NOT_REQUIRED;
- break;
- }
- break;
- }
- break;
- default:
- return kvm_psci_call(vcpu);
- }
-
- smccc_set_retval(vcpu, val, 0, 0, 0);
- return 1;
-}
-
int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
{
return 3; /* PSCI version and two workaround registers */
diff --git a/virt/kvm/arm/pvtime.c b/virt/kvm/arm/pvtime.c
new file mode 100644
index 000000000000..1e0f4c284888
--- /dev/null
+++ b/virt/kvm/arm/pvtime.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/pvclock-abi.h>
+
+#include <kvm/arm_hypercalls.h>
+
+void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 steal;
+ __le64 steal_le;
+ u64 offset;
+ int idx;
+ u64 base = vcpu->arch.steal.base;
+
+ if (base == GPA_INVALID)
+ return;
+
+ /* Let's do the local bookkeeping */
+ steal = vcpu->arch.steal.steal;
+ steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
+ vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+ vcpu->arch.steal.steal = steal;
+
+ steal_le = cpu_to_le64(steal);
+ idx = srcu_read_lock(&kvm->srcu);
+ offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
+ kvm_put_guest(kvm, base + offset, steal_le, u64);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
+long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
+{
+ u32 feature = smccc_get_arg1(vcpu);
+ long val = SMCCC_RET_NOT_SUPPORTED;
+
+ switch (feature) {
+ case ARM_SMCCC_HV_PV_TIME_FEATURES:
+ case ARM_SMCCC_HV_PV_TIME_ST:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ }
+
+ return val;
+}
+
+gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
+{
+ struct pvclock_vcpu_stolen_time init_values = {};
+ struct kvm *kvm = vcpu->kvm;
+ u64 base = vcpu->arch.steal.base;
+ int idx;
+
+ if (base == GPA_INVALID)
+ return base;
+
+ /*
+ * Start counting stolen time from the time the guest requests
+ * the feature enabled.
+ */
+ vcpu->arch.steal.steal = 0;
+ vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_write_guest(kvm, base, &init_values, sizeof(init_values));
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return base;
+}
+
+int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *user = (u64 __user *)attr->addr;
+ struct kvm *kvm = vcpu->kvm;
+ u64 ipa;
+ int ret = 0;
+ int idx;
+
+ if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+ return -ENXIO;
+
+ if (get_user(ipa, user))
+ return -EFAULT;
+ if (!IS_ALIGNED(ipa, 64))
+ return -EINVAL;
+ if (vcpu->arch.steal.base != GPA_INVALID)
+ return -EEXIST;
+
+ /* Check the address is in a valid memslot */
+ idx = srcu_read_lock(&kvm->srcu);
+ if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT)))
+ ret = -EINVAL;
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!ret)
+ vcpu->arch.steal.base = ipa;
+
+ return ret;
+}
+
+int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *user = (u64 __user *)attr->addr;
+ u64 ipa;
+
+ if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+ return -ENXIO;
+
+ ipa = vcpu->arch.steal.base;
+
+ if (put_user(ipa, user))
+ return -EFAULT;
+ return 0;
+}
+
+int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_PVTIME_IPA:
+ return 0;
+ }
+ return -ENXIO;
+}
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 6f50c429196d..b3c5de48064c 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -203,6 +203,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
raw_spin_lock_init(&vgic_cpu->ap_list_lock);
+ atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
/*
* Enable and configure all SGIs to be edge-triggered and
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 2be6b66b3856..98c7360d9fb7 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -360,7 +360,10 @@ static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
if (ret)
return ret;
+ if (map.vpe)
+ atomic_dec(&map.vpe->vlpi_count);
map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ atomic_inc(&map.vpe->vlpi_count);
ret = its_map_vlpi(irq->host_irq, &map);
}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 8d69f007dd0c..f45635a6f0ec 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -357,14 +357,14 @@ retry:
}
/**
- * vgic_its_save_pending_tables - Save the pending tables into guest RAM
+ * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
* kvm lock and all vcpu lock must be held
*/
int vgic_v3_save_pending_tables(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- int last_byte_offset = -1;
struct vgic_irq *irq;
+ gpa_t last_ptr = ~(gpa_t)0;
int ret;
u8 val;
@@ -384,11 +384,11 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
bit_nr = irq->intid % BITS_PER_BYTE;
ptr = pendbase + byte_offset;
- if (byte_offset != last_byte_offset) {
+ if (ptr != last_ptr) {
ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
if (ret)
return ret;
- last_byte_offset = byte_offset;
+ last_ptr = ptr;
}
stored = val & (1U << bit_nr);
@@ -664,6 +664,8 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
if (has_vhe())
__vgic_v3_activate_traps(vcpu);
+
+ WARN_ON(vgic_v4_load(vcpu));
}
void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
@@ -676,6 +678,8 @@ void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
void vgic_v3_put(struct kvm_vcpu *vcpu)
{
+ WARN_ON(vgic_v4_put(vcpu, false));
+
vgic_v3_vmcr_sync(vcpu);
kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c
index 477af6aebb97..46f875589c47 100644
--- a/virt/kvm/arm/vgic/vgic-v4.c
+++ b/virt/kvm/arm/vgic/vgic-v4.c
@@ -85,6 +85,10 @@ static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
{
struct kvm_vcpu *vcpu = info;
+ /* We got the message, no need to fire again */
+ if (!irqd_irq_disabled(&irq_to_desc(irq)->irq_data))
+ disable_irq_nosync(irq);
+
vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
kvm_vcpu_kick(vcpu);
@@ -192,20 +196,30 @@ void vgic_v4_teardown(struct kvm *kvm)
its_vm->vpes = NULL;
}
-int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu)
+int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
{
- if (!vgic_supports_direct_msis(vcpu->kvm))
+ struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ struct irq_desc *desc = irq_to_desc(vpe->irq);
+
+ if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
return 0;
- return its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, false);
+ /*
+ * If blocking, a doorbell is required. Undo the nested
+ * disable_irq() calls...
+ */
+ while (need_db && irqd_irq_disabled(&desc->irq_data))
+ enable_irq(vpe->irq);
+
+ return its_schedule_vpe(vpe, false);
}
-int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu)
+int vgic_v4_load(struct kvm_vcpu *vcpu)
{
- int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq;
+ struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
int err;
- if (!vgic_supports_direct_msis(vcpu->kvm))
+ if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
return 0;
/*
@@ -214,11 +228,14 @@ int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu)
* doc in drivers/irqchip/irq-gic-v4.c to understand how this
* turns into a VMOVP command at the ITS level.
*/
- err = irq_set_affinity(irq, cpumask_of(smp_processor_id()));
+ err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id()));
if (err)
return err;
- err = its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, true);
+ /* Disabled the doorbell, as we're about to enter the guest */
+ disable_irq_nosync(vpe->irq);
+
+ err = its_schedule_vpe(vpe, true);
if (err)
return err;
@@ -226,9 +243,7 @@ int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu)
* Now that the VPE is resident, let's get rid of a potential
* doorbell interrupt that would still be pending.
*/
- err = irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false);
-
- return err;
+ return irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false);
}
static struct vgic_its *vgic_get_its(struct kvm *kvm,
@@ -266,7 +281,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
mutex_lock(&its->its_lock);
- /* Perform then actual DevID/EventID -> LPI translation. */
+ /* Perform the actual DevID/EventID -> LPI translation. */
ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
irq_entry->msi.data, &irq);
if (ret)
@@ -294,6 +309,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
irq->hw = true;
irq->host_irq = virq;
+ atomic_inc(&map.vpe->vlpi_count);
out:
mutex_unlock(&its->its_lock);
@@ -327,6 +343,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
WARN_ON(!(irq->hw && irq->host_irq == virq));
if (irq->hw) {
+ atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
ret = its_unmap_vlpi(virq);
}
@@ -335,21 +352,3 @@ out:
mutex_unlock(&its->its_lock);
return ret;
}
-
-void kvm_vgic_v4_enable_doorbell(struct kvm_vcpu *vcpu)
-{
- if (vgic_supports_direct_msis(vcpu->kvm)) {
- int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq;
- if (irq)
- enable_irq(irq);
- }
-}
-
-void kvm_vgic_v4_disable_doorbell(struct kvm_vcpu *vcpu)
-{
- if (vgic_supports_direct_msis(vcpu->kvm)) {
- int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq;
- if (irq)
- disable_irq(irq);
- }
-}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 45a870cb63f5..99b02ca730a8 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -857,8 +857,6 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- WARN_ON(vgic_v4_sync_hwstate(vcpu));
-
/* An empty ap_list_head implies used_lrs == 0 */
if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
return;
@@ -882,8 +880,6 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
/* Flush our emulation state into the GIC hardware before entering the guest. */
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
- WARN_ON(vgic_v4_flush_hwstate(vcpu));
-
/*
* If there are no virtual interrupts active or pending for this
* VCPU, then there is no work to do and we can bail out without
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 83066a81b16a..c7fefd6b1c80 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -316,7 +316,5 @@ void vgic_its_invalidate_cache(struct kvm *kvm);
bool vgic_supports_direct_msis(struct kvm *kvm);
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
-int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu);
-int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu);
#endif
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 8ffd07e2a160..00c747dbc82e 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -110,14 +110,11 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
int kvm_coalesced_mmio_init(struct kvm *kvm)
{
struct page *page;
- int ret;
- ret = -ENOMEM;
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
- goto out_err;
+ return -ENOMEM;
- ret = 0;
kvm->coalesced_mmio_ring = page_address(page);
/*
@@ -128,8 +125,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
spin_lock_init(&kvm->ring_lock);
INIT_LIST_HEAD(&kvm->coalesced_zones);
-out_err:
- return ret;
+ return 0;
}
void kvm_coalesced_mmio_free(struct kvm *kvm)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13efc291b1c7..00268290dcbd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -838,6 +838,18 @@ void kvm_put_kvm(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);
+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner. In such cases, the caller is still actively using @kvm and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
+{
+ WARN_ON(refcount_dec_and_test(&kvm->users_count));
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
@@ -2739,17 +2751,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
goto unlock_vcpu_destroy;
}
- BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+ vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
+ BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
if (r < 0) {
- kvm_put_kvm(kvm);
+ kvm_put_kvm_no_destroy(kvm);
goto unlock_vcpu_destroy;
}
- kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+ kvm->vcpus[vcpu->vcpu_idx] = vcpu;
/*
* Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
@@ -3115,14 +3128,14 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
return filp->private_data;
}
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
#ifdef CONFIG_KVM_MPIC
[KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
[KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
#endif
};
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
{
if (type >= ARRAY_SIZE(kvm_device_ops_table))
return -ENOSPC;
@@ -3143,7 +3156,7 @@ void kvm_unregister_device_ops(u32 type)
static int kvm_ioctl_create_device(struct kvm *kvm,
struct kvm_create_device *cd)
{
- struct kvm_device_ops *ops = NULL;
+ const struct kvm_device_ops *ops = NULL;
struct kvm_device *dev;
bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
int type;
@@ -3183,7 +3196,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
kvm_get_kvm(kvm);
ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
if (ret < 0) {
- kvm_put_kvm(kvm);
+ kvm_put_kvm_no_destroy(kvm);
mutex_lock(&kvm->lock);
list_del(&dev->vm_node);
mutex_unlock(&kvm->lock);
@@ -4341,12 +4354,12 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
r = kvm_arch_hardware_setup();
if (r < 0)
- goto out_free_0a;
+ goto out_free_1;
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, check_processor_compat, &r, 1);
if (r < 0)
- goto out_free_1;
+ goto out_free_2;
}
r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
@@ -4403,9 +4416,8 @@ out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);
cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
-out_free_1:
kvm_arch_hardware_unsetup();
-out_free_0a:
+out_free_1:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
kvm_irqfd_exit();