aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds2023-07-03 15:32:22 -0700
committerLinus Torvalds2023-07-03 15:32:22 -0700
commite8069f5a8e3bdb5fdeeff895780529388592ee7a (patch)
treece35ab85db9b66a7e488707fccdb33ce54f696dd
parenteded37770c9f80ecd5ba842359c4f1058d9812c3 (diff)
parent255006adb3da71bb75c334453786df781b415f54 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "ARM64: - Eager page splitting optimization for dirty logging, optionally allowing for a VM to avoid the cost of hugepage splitting in the stage-2 fault path. - Arm FF-A proxy for pKVM, allowing a pKVM host to safely interact with services that live in the Secure world. pKVM intervenes on FF-A calls to guarantee the host doesn't misuse memory donated to the hyp or a pKVM guest. - Support for running the split hypervisor with VHE enabled, known as 'hVHE' mode. This is extremely useful for testing the split hypervisor on VHE-only systems, and paves the way for new use cases that depend on having two TTBRs available at EL2. - Generalized framework for configurable ID registers from userspace. KVM/arm64 currently prevents arbitrary CPU feature set configuration from userspace, but the intent is to relax this limitation and allow userspace to select a feature set consistent with the CPU. - Enable the use of Branch Target Identification (FEAT_BTI) in the hypervisor. - Use a separate set of pointer authentication keys for the hypervisor when running in protected mode, as the host is untrusted at runtime. - Ensure timer IRQs are consistently released in the init failure paths. - Avoid trapping CTR_EL0 on systems with Enhanced Virtualization Traps (FEAT_EVT), as it is a register commonly read from userspace. - Erratum workaround for the upcoming AmpereOne part, which has broken hardware A/D state management. RISC-V: - Redirect AMO load/store misaligned traps to KVM guest - Trap-n-emulate AIA in-kernel irqchip for KVM guest - Svnapot support for KVM Guest s390: - New uvdevice secret API - CMM selftest and fixes - fix racy access to target CPU for diag 9c x86: - Fix missing/incorrect #GP checks on ENCLS - Use standard mmu_notifier hooks for handling APIC access page - Drop now unnecessary TR/TSS load after VM-Exit on AMD - Print more descriptive information about the status of SEV and SEV-ES during module load - Add a test for splitting and reconstituting hugepages during and after dirty logging - Add support for CPU pinning in demand paging test - Add support for AMD PerfMonV2, with a variety of cleanups and minor fixes included along the way - Add a "nx_huge_pages=never" option to effectively avoid creating NX hugepage recovery threads (because nx_huge_pages=off can be toggled at runtime) - Move handling of PAT out of MTRR code and dedup SVM+VMX code - Fix output of PIC poll command emulation when there's an interrupt - Add a maintainer's handbook to document KVM x86 processes, preferred coding style, testing expectations, etc. - Misc cleanups, fixes and comments Generic: - Miscellaneous bugfixes and cleanups Selftests: - Generate dependency files so that partial rebuilds work as expected" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (153 commits) Documentation/process: Add a maintainer handbook for KVM x86 Documentation/process: Add a label for the tip tree handbook's coding style KVM: arm64: Fix misuse of KVM_ARM_VCPU_POWER_OFF bit index RISC-V: KVM: Remove unneeded semicolon RISC-V: KVM: Allow Svnapot extension for Guest/VM riscv: kvm: define vcpu_sbi_ext_pmu in header RISC-V: KVM: Expose IMSIC registers as attributes of AIA irqchip RISC-V: KVM: Add in-kernel virtualization of AIA IMSIC RISC-V: KVM: Expose APLIC registers as attributes of AIA irqchip RISC-V: KVM: Add in-kernel emulation of AIA APLIC RISC-V: KVM: Implement device interface for AIA irqchip RISC-V: KVM: Skeletal in-kernel AIA irqchip support RISC-V: KVM: Set kvm_riscv_aia_nr_hgei to zero RISC-V: KVM: Add APLIC related defines RISC-V: KVM: Add IMSIC related defines RISC-V: KVM: Implement guest external interrupt line management KVM: x86: Remove PRIx* definitions as they are solely for user space s390/uv: Update query for secret-UVCs s390/uv: replace scnprintf with sysfs_emit s390/uvdevice: Add 'Lock Secret Store' UVC ...
-rw-r--r--Documentation/arch/arm64/silicon-errata.rst3
-rw-r--r--Documentation/process/maintainer-handbooks.rst1
-rw-r--r--Documentation/process/maintainer-kvm-x86.rst390
-rw-r--r--Documentation/process/maintainer-tip.rst2
-rw-r--r--Documentation/virt/kvm/api.rst27
-rw-r--r--Documentation/virt/kvm/x86/mmu.rst2
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/arm64/Kconfig19
-rw-r--r--arch/arm64/include/asm/cpufeature.h6
-rw-r--r--arch/arm64/include/asm/el2_setup.h27
-rw-r--r--arch/arm64/include/asm/kvm_arm.h7
-rw-r--r--arch/arm64/include/asm/kvm_asm.h4
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h46
-rw-r--r--arch/arm64/include/asm/kvm_host.h61
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h37
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h4
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h79
-rw-r--r--arch/arm64/include/asm/kvm_pkvm.h21
-rw-r--r--arch/arm64/include/asm/sysreg.h1
-rw-r--r--arch/arm64/include/asm/virt.h12
-rw-r--r--arch/arm64/kernel/cpu_errata.c7
-rw-r--r--arch/arm64/kernel/cpufeature.c34
-rw-r--r--arch/arm64/kernel/head.S2
-rw-r--r--arch/arm64/kernel/hyp-stub.S10
-rw-r--r--arch/arm64/kernel/idreg-override.c25
-rw-r--r--arch/arm64/kernel/kaslr.c6
-rw-r--r--arch/arm64/kvm/arch_timer.c14
-rw-r--r--arch/arm64/kvm/arm.c201
-rw-r--r--arch/arm64/kvm/fpsimd.c4
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h101
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/ffa.h17
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mem_protect.h3
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c762
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S36
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S32
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c19
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c74
-rw-r--r--arch/arm64/kvm/hyp/nvhe/pkvm.c27
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c11
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c28
-rw-r--r--arch/arm64/kvm/hyp/nvhe/timer-sr.c16
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c52
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c228
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c2
-rw-r--r--arch/arm64/kvm/hyp/vhe/tlb.c32
-rw-r--r--arch/arm64/kvm/mmu.c207
-rw-r--r--arch/arm64/kvm/pkvm.c1
-rw-r--r--arch/arm64/kvm/reset.c58
-rw-r--r--arch/arm64/kvm/sys_regs.c505
-rw-r--r--arch/arm64/kvm/sys_regs.h22
-rw-r--r--arch/arm64/tools/cpucaps3
-rw-r--r--arch/riscv/include/asm/csr.h2
-rw-r--r--arch/riscv/include/asm/kvm_aia.h107
-rw-r--r--arch/riscv/include/asm/kvm_aia_aplic.h58
-rw-r--r--arch/riscv/include/asm/kvm_aia_imsic.h38
-rw-r--r--arch/riscv/include/asm/kvm_host.h4
-rw-r--r--arch/riscv/include/asm/kvm_vcpu_sbi.h11
-rw-r--r--arch/riscv/include/uapi/asm/kvm.h73
-rw-r--r--arch/riscv/kvm/Kconfig4
-rw-r--r--arch/riscv/kvm/Makefile3
-rw-r--r--arch/riscv/kvm/aia.c274
-rw-r--r--arch/riscv/kvm/aia_aplic.c619
-rw-r--r--arch/riscv/kvm/aia_device.c673
-rw-r--r--arch/riscv/kvm/aia_imsic.c1084
-rw-r--r--arch/riscv/kvm/main.c3
-rw-r--r--arch/riscv/kvm/tlb.c2
-rw-r--r--arch/riscv/kvm/vcpu.c4
-rw-r--r--arch/riscv/kvm/vcpu_exit.c2
-rw-r--r--arch/riscv/kvm/vcpu_sbi.c80
-rw-r--r--arch/riscv/kvm/vm.c118
-rw-r--r--arch/s390/boot/uv.c4
-rw-r--r--arch/s390/include/asm/uv.h32
-rw-r--r--arch/s390/include/uapi/asm/uvdevice.h53
-rw-r--r--arch/s390/kernel/uv.c108
-rw-r--r--arch/s390/kvm/diag.c8
-rw-r--r--arch/s390/kvm/kvm-s390.c4
-rw-r--r--arch/s390/kvm/vsie.c6
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/kvm/cpuid.c43
-rw-r--r--arch/x86/kvm/i8259.c3
-rw-r--r--arch/x86/kvm/lapic.c5
-rw-r--r--arch/x86/kvm/mmu/mmu.c53
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c5
-rw-r--r--arch/x86/kvm/mtrr.c64
-rw-r--r--arch/x86/kvm/pmu.c92
-rw-r--r--arch/x86/kvm/pmu.h56
-rw-r--r--arch/x86/kvm/reverse_cpuid.h7
-rw-r--r--arch/x86/kvm/svm/pmu.c68
-rw-r--r--arch/x86/kvm/svm/sev.c19
-rw-r--r--arch/x86/kvm/svm/svm.c56
-rw-r--r--arch/x86/kvm/svm/svm.h1
-rw-r--r--arch/x86/kvm/vmx/capabilities.h4
-rw-r--r--arch/x86/kvm/vmx/nested.c7
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c79
-rw-r--r--arch/x86/kvm/vmx/sgx.c15
-rw-r--r--arch/x86/kvm/vmx/vmenter.S2
-rw-r--r--arch/x86/kvm/vmx/vmx.c77
-rw-r--r--arch/x86/kvm/vmx/vmx.h12
-rw-r--r--arch/x86/kvm/x86.c80
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--drivers/s390/char/Kconfig2
-rw-r--r--drivers/s390/char/uvdevice.c231
-rw-r--r--include/kvm/arm_pmu.h8
-rw-r--r--include/kvm/iodev.h6
-rw-r--r--include/linux/arm_ffa.h8
-rw-r--r--include/linux/kvm_host.h9
-rw-r--r--include/uapi/linux/kvm.h6
-rw-r--r--tools/testing/selftests/kvm/Makefile19
-rw-r--r--tools/testing/selftests/kvm/demand_paging_test.c32
-rw-r--r--tools/testing/selftests/kvm/dirty_log_perf_test.c96
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util_base.h1
-rw-r--r--tools/testing/selftests/kvm/include/memstress.h8
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c17
-rw-r--r--tools/testing/selftests/kvm/lib/memstress.c75
-rw-r--r--tools/testing/selftests/kvm/lib/userfaultfd_util.c4
-rw-r--r--tools/testing/selftests/kvm/s390x/cmma_test.c700
-rw-r--r--tools/testing/selftests/kvm/x86_64/cpuid_test.c21
-rw-r--r--tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c259
-rw-r--r--tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c2
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c22
-rw-r--r--virt/kvm/coalesced_mmio.c9
-rw-r--r--virt/kvm/eventfd.c8
-rw-r--r--virt/kvm/kvm_main.c51
125 files changed, 8004 insertions, 1007 deletions
diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index f093a9d8bc5c..496cdca5cb99 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -52,6 +52,9 @@ stable kernels.
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
+| Ampere | AmpereOne | AC03_CPU_38 | AMPERE_ERRATUM_AC03_CPU_38 |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |
diff --git a/Documentation/process/maintainer-handbooks.rst b/Documentation/process/maintainer-handbooks.rst
index fe24cb665fb7..9992bfd7eaa3 100644
--- a/Documentation/process/maintainer-handbooks.rst
+++ b/Documentation/process/maintainer-handbooks.rst
@@ -18,3 +18,4 @@ Contents:
maintainer-netdev
maintainer-soc
maintainer-tip
+ maintainer-kvm-x86
diff --git a/Documentation/process/maintainer-kvm-x86.rst b/Documentation/process/maintainer-kvm-x86.rst
new file mode 100644
index 000000000000..9183bd449762
--- /dev/null
+++ b/Documentation/process/maintainer-kvm-x86.rst
@@ -0,0 +1,390 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+KVM x86
+=======
+
+Foreword
+--------
+KVM strives to be a welcoming community; contributions from newcomers are
+valued and encouraged. Please do not be discouraged or intimidated by the
+length of this document and the many rules/guidelines it contains. Everyone
+makes mistakes, and everyone was a newbie at some point. So long as you make
+an honest effort to follow KVM x86's guidelines, are receptive to feedback,
+and learn from any mistakes you make, you will be welcomed with open arms, not
+torches and pitchforks.
+
+TL;DR
+-----
+Testing is mandatory. Be consistent with established styles and patterns.
+
+Trees
+-----
+KVM x86 is currently in a transition period from being part of the main KVM
+tree, to being "just another KVM arch". As such, KVM x86 is split across the
+main KVM tree, ``git.kernel.org/pub/scm/virt/kvm/kvm.git``, and a KVM x86
+specific tree, ``github.com/kvm-x86/linux.git``.
+
+Generally speaking, fixes for the current cycle are applied directly to the
+main KVM tree, while all development for the next cycle is routed through the
+KVM x86 tree. In the unlikely event that a fix for the current cycle is routed
+through the KVM x86 tree, it will be applied to the ``fixes`` branch before
+making its way to the main KVM tree.
+
+Note, this transition period is expected to last quite some time, i.e. will be
+the status quo for the foreseeable future.
+
+Branches
+~~~~~~~~
+The KVM x86 tree is organized into multiple topic branches. The purpose of
+using finer-grained topic branches is to make it easier to keep tabs on an area
+of development, and to limit the collateral damage of human errors and/or buggy
+commits, e.g. dropping the HEAD commit of a topic branch has no impact on other
+in-flight commits' SHA1 hashes, and having to reject a pull request due to bugs
+delays only that topic branch.
+
+All topic branches, except for ``next`` and ``fixes``, are rolled into ``next``
+via a Cthulhu merge on an as-needed basis, i.e. when a topic branch is updated.
+As a result, force pushes to ``next`` are common.
+
+Lifecycle
+~~~~~~~~~
+Fixes that target the current release, a.k.a. mainline, are typically applied
+directly to the main KVM tree, i.e. do not route through the KVM x86 tree.
+
+Changes that target the next release are routed through the KVM x86 tree. Pull
+requests (from KVM x86 to main KVM) are sent for each KVM x86 topic branch,
+typically the week before Linus' opening of the merge window, e.g. the week
+following rc7 for "normal" releases. If all goes well, the topic branches are
+rolled into the main KVM pull request sent during Linus' merge window.
+
+The KVM x86 tree doesn't have its own official merge window, but there's a soft
+close around rc5 for new features, and a soft close around rc6 for fixes (for
+the next release; see above for fixes that target the current release).
+
+Timeline
+~~~~~~~~
+Submissions are typically reviewed and applied in FIFO order, with some wiggle
+room for the size of a series, patches that are "cache hot", etc. Fixes,
+especially for the current release and or stable trees, get to jump the queue.
+Patches that will be taken through a non-KVM tree (most often through the tip
+tree) and/or have other acks/reviews also jump the queue to some extent.
+
+Note, the vast majority of review is done between rc1 and rc6, give or take.
+The period between rc6 and the next rc1 is used to catch up on other tasks,
+i.e. radio silence during this period isn't unusual.
+
+Pings to get a status update are welcome, but keep in mind the timing of the
+current release cycle and have realistic expectations. If you are pinging for
+acceptance, i.e. not just for feedback or an update, please do everything you
+can, within reason, to ensure that your patches are ready to be merged! Pings
+on series that break the build or fail tests lead to unhappy maintainers!
+
+Development
+-----------
+
+Base Tree/Branch
+~~~~~~~~~~~~~~~~
+Fixes that target the current release, a.k.a. mainline, should be based on
+``git://git.kernel.org/pub/scm/virt/kvm/kvm.git master``. Note, fixes do not
+automatically warrant inclusion in the current release. There is no singular
+rule, but typically only fixes for bugs that are urgent, critical, and/or were
+introduced in the current release should target the current release.
+
+Everything else should be based on ``kvm-x86/next``, i.e. there is no need to
+select a specific topic branch as the base. If there are conflicts and/or
+dependencies across topic branches, it is the maintainer's job to sort them
+out.
+
+The only exception to using ``kvm-x86/next`` as the base is if a patch/series
+is a multi-arch series, i.e. has non-trivial modifications to common KVM code
+and/or has more than superficial changes to other architectures' code. Multi-
+arch patch/series should instead be based on a common, stable point in KVM's
+history, e.g. the release candidate upon which ``kvm-x86 next`` is based. If
+you're unsure whether a patch/series is truly multi-arch, err on the side of
+caution and treat it as multi-arch, i.e. use a common base.
+
+Coding Style
+~~~~~~~~~~~~
+When it comes to style, naming, patterns, etc., consistency is the number one
+priority in KVM x86. If all else fails, match what already exists.
+
+With a few caveats listed below, follow the tip tree maintainers' preferred
+:ref:`maintainer-tip-coding-style`, as patches/series often touch both KVM and
+non-KVM x86 files, i.e. draw the attention of KVM *and* tip tree maintainers.
+
+Using reverse fir tree, a.k.a. reverse Christmas tree or reverse XMAS tree, for
+variable declarations isn't strictly required, though it is still preferred.
+
+Except for a handful of special snowflakes, do not use kernel-doc comments for
+functions. The vast majority of "public" KVM functions aren't truly public as
+they are intended only for KVM-internal consumption (there are plans to
+privatize KVM's headers and exports to enforce this).
+
+Comments
+~~~~~~~~
+Write comments using imperative mood and avoid pronouns. Use comments to
+provide a high level overview of the code, and/or to explain why the code does
+what it does. Do not reiterate what the code literally does; let the code
+speak for itself. If the code itself is inscrutable, comments will not help.
+
+SDM and APM References
+~~~~~~~~~~~~~~~~~~~~~~
+Much of KVM's code base is directly tied to architectural behavior defined in
+Intel's Software Development Manual (SDM) and AMD's Architecture Programmer’s
+Manual (APM). Use of "Intel's SDM" and "AMD's APM", or even just "SDM" or
+"APM", without additional context is a-ok.
+
+Do not reference specific sections, tables, figures, etc. by number, especially
+not in comments. Instead, if necessary (see below), copy-paste the relevant
+snippet and reference sections/tables/figures by name. The layouts of the SDM
+and APM are constantly changing, and so the numbers/labels aren't stable.
+
+Generally speaking, do not explicitly reference or copy-paste from the SDM or
+APM in comments. With few exceptions, KVM *must* honor architectural behavior,
+therefore it's implied that KVM behavior is emulating SDM and/or APM behavior.
+Note, referencing the SDM/APM in changelogs to justify the change and provide
+context is perfectly ok and encouraged.
+
+Shortlog
+~~~~~~~~
+The preferred prefix format is ``KVM: <topic>:``, where ``<topic>`` is one of::
+
+ - x86
+ - x86/mmu
+ - x86/pmu
+ - x86/xen
+ - selftests
+ - SVM
+ - nSVM
+ - VMX
+ - nVMX
+
+**DO NOT use x86/kvm!** ``x86/kvm`` is used exclusively for Linux-as-a-KVM-guest
+changes, i.e. for arch/x86/kernel/kvm.c. Do not use file names or complete file
+paths as the subject/shortlog prefix.
+
+Note, these don't align with the topics branches (the topic branches care much
+more about code conflicts).
+
+All names are case sensitive! ``KVM: x86:`` is good, ``kvm: vmx:`` is not.
+
+Capitalize the first word of the condensed patch description, but omit ending
+punctionation. E.g.::
+
+ KVM: x86: Fix a null pointer dereference in function_xyz()
+
+not::
+
+ kvm: x86: fix a null pointer dereference in function_xyz.
+
+If a patch touches multiple topics, traverse up the conceptual tree to find the
+first common parent (which is often simply ``x86``). When in doubt,
+``git log path/to/file`` should provide a reasonable hint.
+
+New topics do occasionally pop up, but please start an on-list discussion if
+you want to propose introducing a new topic, i.e. don't go rogue.
+
+See :ref:`the_canonical_patch_format` for more information, with one amendment:
+do not treat the 70-75 character limit as an absolute, hard limit. Instead,
+use 75 characters as a firm-but-not-hard limit, and use 80 characters as a hard
+limit. I.e. let the shortlog run a few characters over the standard limit if
+you have good reason to do so.
+
+Changelog
+~~~~~~~~~
+Most importantly, write changelogs using imperative mood and avoid pronouns.
+
+See :ref:`describe_changes` for more information, with one amendment: lead with
+a short blurb on the actual changes, and then follow up with the context and
+background. Note! This order directly conflicts with the tip tree's preferred
+approach! Please follow the tip tree's preferred style when sending patches
+that primarily target arch/x86 code that is _NOT_ KVM code.
+
+Stating what a patch does before diving into details is preferred by KVM x86
+for several reasons. First and foremost, what code is actually being changed
+is arguably the most important information, and so that info should be easy to
+find. Changelogs that bury the "what's actually changing" in a one-liner after
+3+ paragraphs of background make it very hard to find that information.
+
+For initial review, one could argue the "what's broken" is more important, but
+for skimming logs and git archaeology, the gory details matter less and less.
+E.g. when doing a series of "git blame", the details of each change along the
+way are useless, the details only matter for the culprit. Providing the "what
+changed" makes it easy to quickly determine whether or not a commit might be of
+interest.
+
+Another benefit of stating "what's changing" first is that it's almost always
+possible to state "what's changing" in a single sentence. Conversely, all but
+the most simple bugs require multiple sentences or paragraphs to fully describe
+the problem. If both the "what's changing" and "what's the bug" are super
+short then the order doesn't matter. But if one is shorter (almost always the
+"what's changing), then covering the shorter one first is advantageous because
+it's less of an inconvenience for readers/reviewers that have a strict ordering
+preference. E.g. having to skip one sentence to get to the context is less
+painful than having to skip three paragraphs to get to "what's changing".
+
+Fixes
+~~~~~
+If a change fixes a KVM/kernel bug, add a Fixes: tag even if the change doesn't
+need to be backported to stable kernels, and even if the change fixes a bug in
+an older release.
+
+Conversely, if a fix does need to be backported, explicitly tag the patch with
+"Cc: stable@vger.kernel" (though the email itself doesn't need to Cc: stable);
+KVM x86 opts out of backporting Fixes: by default. Some auto-selected patches
+do get backported, but require explicit maintainer approval (search MANUALSEL).
+
+Function References
+~~~~~~~~~~~~~~~~~~~
+When a function is mentioned in a comment, changelog, or shortlog (or anywhere
+for that matter), use the format ``function_name()``. The parentheses provide
+context and disambiguate the reference.
+
+Testing
+-------
+At a bare minimum, *all* patches in a series must build cleanly for KVM_INTEL=m
+KVM_AMD=m, and KVM_WERROR=y. Building every possible combination of Kconfigs
+isn't feasible, but the more the merrier. KVM_SMM, KVM_XEN, PROVE_LOCKING, and
+X86_64 are particularly interesting knobs to turn.
+
+Running KVM selftests and KVM-unit-tests is also mandatory (and stating the
+obvious, the tests need to pass). The only exception is for changes that have
+negligible probability of affecting runtime behavior, e.g. patches that only
+modify comments. When possible and relevant, testing on both Intel and AMD is
+strongly preferred. Booting an actual VM is encouraged, but not mandatory.
+
+For changes that touch KVM's shadow paging code, running with TDP (EPT/NPT)
+disabled is mandatory. For changes that affect common KVM MMU code, running
+with TDP disabled is strongly encouraged. For all other changes, if the code
+being modified depends on and/or interacts with a module param, testing with
+the relevant settings is mandatory.
+
+Note, KVM selftests and KVM-unit-tests do have known failures. If you suspect
+a failure is not due to your changes, verify that the *exact same* failure
+occurs with and without your changes.
+
+Changes that touch reStructured Text documentation, i.e. .rst files, must build
+htmldocs cleanly, i.e. with no new warnings or errors.
+
+If you can't fully test a change, e.g. due to lack of hardware, clearly state
+what level of testing you were able to do, e.g. in the cover letter.
+
+New Features
+~~~~~~~~~~~~
+With one exception, new features *must* come with test coverage. KVM specific
+tests aren't strictly required, e.g. if coverage is provided by running a
+sufficiently enabled guest VM, or by running a related kernel selftest in a VM,
+but dedicated KVM tests are preferred in all cases. Negative testcases in
+particular are mandatory for enabling of new hardware features as error and
+exception flows are rarely exercised simply by running a VM.
+
+The only exception to this rule is if KVM is simply advertising support for a
+feature via KVM_GET_SUPPORTED_CPUID, i.e. for instructions/features that KVM
+can't prevent a guest from using and for which there is no true enabling.
+
+Note, "new features" does not just mean "new hardware features"! New features
+that can't be well validated using existing KVM selftests and/or KVM-unit-tests
+must come with tests.
+
+Posting new feature development without tests to get early feedback is more
+than welcome, but such submissions should be tagged RFC, and the cover letter
+should clearly state what type of feedback is requested/expected. Do not abuse
+the RFC process; RFCs will typically not receive in-depth review.
+
+Bug Fixes
+~~~~~~~~~
+Except for "obvious" found-by-inspection bugs, fixes must be accompanied by a
+reproducer for the bug being fixed. In many cases the reproducer is implicit,
+e.g. for build errors and test failures, but it should still be clear to
+readers what is broken and how to verify the fix. Some leeway is given for
+bugs that are found via non-public workloads/tests, but providing regression
+tests for such bugs is strongly preferred.
+
+In general, regression tests are preferred for any bug that is not trivial to
+hit. E.g. even if the bug was originally found by a fuzzer such as syzkaller,
+a targeted regression test may be warranted if the bug requires hitting a
+one-in-a-million type race condition.
+
+Note, KVM bugs are rarely urgent *and* non-trivial to reproduce. Ask yourself
+if a bug is really truly the end of the world before posting a fix without a
+reproducer.
+
+Posting
+-------
+
+Links
+~~~~~
+Do not explicitly reference bug reports, prior versions of a patch/series, etc.
+via ``In-Reply-To:`` headers. Using ``In-Reply-To:`` becomes an unholy mess
+for large series and/or when the version count gets high, and ``In-Reply-To:``
+is useless for anyone that doesn't have the original message, e.g. if someone
+wasn't Cc'd on the bug report or if the list of recipients changes between
+versions.
+
+To link to a bug report, previous version, or anything of interest, use lore
+links. For referencing previous version(s), generally speaking do not include
+a Link: in the changelog as there is no need to record the history in git, i.e.
+put the link in the cover letter or in the section git ignores. Do provide a
+formal Link: for bug reports and/or discussions that led to the patch. The
+context of why a change was made is highly valuable for future readers.
+
+Git Base
+~~~~~~~~
+If you are using git version 2.9.0 or later (Googlers, this is all of you!),
+use ``git format-patch`` with the ``--base`` flag to automatically include the
+base tree information in the generated patches.
+
+Note, ``--base=auto`` works as expected if and only if a branch's upstream is
+set to the base topic branch, e.g. it will do the wrong thing if your upstream
+is set to your personal repository for backup purposes. An alternative "auto"
+solution is to derive the names of your development branches based on their
+KVM x86 topic, and feed that into ``--base``. E.g. ``x86/pmu/my_branch_name``,
+and then write a small wrapper to extract ``pmu`` from the current branch name
+to yield ``--base=x/pmu``, where ``x`` is whatever name your repository uses to
+track the KVM x86 remote.
+
+Co-Posting Tests
+~~~~~~~~~~~~~~~~
+KVM selftests that are associated with KVM changes, e.g. regression tests for
+bug fixes, should be posted along with the KVM changes as a single series. The
+standard kernel rules for bisection apply, i.e. KVM changes that result in test
+failures should be ordered after the selftests updates, and vice versa, new
+tests that fail due to KVM bugs should be ordered after the KVM fixes.
+
+KVM-unit-tests should *always* be posted separately. Tools, e.g. b4 am, don't
+know that KVM-unit-tests is a separate repository and get confused when patches
+in a series apply on different trees. To tie KVM-unit-tests patches back to
+KVM patches, first post the KVM changes and then provide a lore Link: to the
+KVM patch/series in the KVM-unit-tests patch(es).
+
+Notifications
+-------------
+When a patch/series is officially accepted, a notification email will be sent
+in reply to the original posting (cover letter for multi-patch series). The
+notification will include the tree and topic branch, along with the SHA1s of
+the commits of applied patches.
+
+If a subset of patches is applied, this will be clearly stated in the
+notification. Unless stated otherwise, it's implied that any patches in the
+series that were not accepted need more work and should be submitted in a new
+version.
+
+If for some reason a patch is dropped after officially being accepted, a reply
+will be sent to the notification email explaining why the patch was dropped, as
+well as the next steps.
+
+SHA1 Stability
+~~~~~~~~~~~~~~
+SHA1s are not 100% guaranteed to be stable until they land in Linus' tree! A
+SHA1 is *usually* stable once a notification has been sent, but things happen.
+In most cases, an update to the notification email be provided if an applied
+patch's SHA1 changes. However, in some scenarios, e.g. if all KVM x86 branches
+need to be rebased, individual notifications will not be given.
+
+Vulnerabilities
+---------------
+Bugs that can be exploited by the guest to attack the host (kernel or
+userspace), or that can be exploited by a nested VM to *its* host (L2 attacking
+L1), are of particular interest to KVM. Please follow the protocol for
+:ref:`securitybugs` if you suspect a bug can lead to an escape, data leak, etc.
+
diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst
index 93d8a794bdfc..08dd0f804410 100644
--- a/Documentation/process/maintainer-tip.rst
+++ b/Documentation/process/maintainer-tip.rst
@@ -455,6 +455,8 @@ and can be added to an existing kernel config by running:
Some of these options are x86-specific and can be left out when testing
on other architectures.
+.. _maintainer-tip-coding-style:
+
Coding style notes
------------------
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 96c4475539c2..c0ddd3035462 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8445,6 +8445,33 @@ structure.
When getting the Modified Change Topology Report value, the attr->addr
must point to a byte where the value will be stored or retrieved from.
+8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
+---------------------------------------
+
+:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
+:Architectures: arm64
+:Type: vm
+:Parameters: arg[0] is the new split chunk size.
+:Returns: 0 on success, -EINVAL if any memslot was already created.
+
+This capability sets the chunk size used in Eager Page Splitting.
+
+Eager Page Splitting improves the performance of dirty-logging (used
+in live migrations) when guest memory is backed by huge-pages. It
+avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing
+it eagerly when enabling dirty logging (with the
+KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using
+KVM_CLEAR_DIRTY_LOG.
+
+The chunk size specifies how many pages to break at a time, using a
+single allocation for each chunk. Bigger the chunk size, more pages
+need to be allocated ahead of time.
+
+The chunk size needs to be a valid block size. The list of acceptable
+block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a
+64-bit bitmap (each bit describing a block size). The default value is
+0, to disable the eager page splitting.
+
9. Known KVM API problems
=========================
diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst
index 8364afa228ec..26f62034b6f3 100644
--- a/Documentation/virt/kvm/x86/mmu.rst
+++ b/Documentation/virt/kvm/x86/mmu.rst
@@ -205,7 +205,7 @@ Shadow pages contain the following information:
role.passthrough:
The page is not backed by a guest page table, but its first entry
points to one. This is set if NPT uses 5-level page tables (host
- CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=1).
+ CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=0).
gfn:
Either the guest page table containing the translations shadowed by this
page, or the base page frame for linear translations. See role.direct.
diff --git a/MAINTAINERS b/MAINTAINERS
index 33dd25d4149f..d7d65163e54e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11546,6 +11546,7 @@ M: Sean Christopherson <seanjc@google.com>
M: Paolo Bonzini <pbonzini@redhat.com>
L: kvm@vger.kernel.org
S: Supported
+P: Documentation/process/maintainer-kvm-x86.rst
T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git
F: arch/x86/include/asm/kvm*
F: arch/x86/include/asm/svm.h
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 891ab530a665..7856c3a3e35a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -414,6 +414,25 @@ menu "Kernel Features"
menu "ARM errata workarounds via the alternatives framework"
+config AMPERE_ERRATUM_AC03_CPU_38
+ bool "AmpereOne: AC03_CPU_38: Certain bits in the Virtualization Translation Control Register and Translation Control Registers do not follow RES0 semantics"
+ default y
+ help
+ This option adds an alternative code sequence to work around Ampere
+ erratum AC03_CPU_38 on AmpereOne.
+
+ The affected design reports FEAT_HAFDBS as not implemented in
+ ID_AA64MMFR1_EL1.HAFDBS, but (V)TCR_ELx.{HA,HD} are not RES0
+ as required by the architecture. The unadvertised HAFDBS
+ implementation suffers from an additional erratum where hardware
+ A/D updates can occur after a PTE has been marked invalid.
+
+ The workaround forces KVM to explicitly set VTCR_EL2.HA to 0,
+ which avoids enabling unadvertised hardware Access Flag management
+ at stage-2.
+
+ If unsure, say Y.
+
config ARM64_WORKAROUND_CLEAN_CACHE
bool
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 7a95c324e52a..96e50227f940 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -15,6 +15,9 @@
#define MAX_CPU_FEATURES 128
#define cpu_feature(x) KERNEL_HWCAP_ ## x
+#define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0
+#define ARM64_SW_FEATURE_OVERRIDE_HVHE 4
+
#ifndef __ASSEMBLY__
#include <linux/bug.h>
@@ -905,6 +908,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1)
return 8;
}
+s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur);
struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id);
extern struct arm64_ftr_override id_aa64mmfr1_override;
@@ -915,6 +919,8 @@ extern struct arm64_ftr_override id_aa64smfr0_override;
extern struct arm64_ftr_override id_aa64isar1_override;
extern struct arm64_ftr_override id_aa64isar2_override;
+extern struct arm64_ftr_override arm64_sw_feature_override;
+
u32 get_kvm_ipa_limit(void);
void dump_cpu_features(void);
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index f4c3d30bf746..8e5ffb58f83e 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -43,6 +43,11 @@
*/
.macro __init_el2_timers
mov x0, #3 // Enable EL1 physical timers
+ mrs x1, hcr_el2
+ and x1, x1, #HCR_E2H
+ cbz x1, .LnVHE_\@
+ lsl x0, x0, #10
+.LnVHE_\@:
msr cnthctl_el2, x0
msr cntvoff_el2, xzr // Clear virtual offset
.endm
@@ -133,8 +138,15 @@
.endm
/* Coprocessor traps */
-.macro __init_el2_nvhe_cptr
+.macro __init_el2_cptr
+ mrs x1, hcr_el2
+ and x1, x1, #HCR_E2H
+ cbz x1, .LnVHE_\@
+ mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN)
+ b .Lset_cptr_\@
+.LnVHE_\@:
mov x0, #0x33ff
+.Lset_cptr_\@:
msr cptr_el2, x0 // Disable copro. traps to EL2
.endm
@@ -210,9 +222,8 @@
__init_el2_gicv3
__init_el2_hstr
__init_el2_nvhe_idregs
- __init_el2_nvhe_cptr
+ __init_el2_cptr
__init_el2_fgt
- __init_el2_nvhe_prepare_eret
.endm
#ifndef __KVM_NVHE_HYPERVISOR__
@@ -258,7 +269,17 @@
.Linit_sve_\@: /* SVE register access */
mrs x0, cptr_el2 // Disable SVE traps
+ mrs x1, hcr_el2
+ and x1, x1, #HCR_E2H
+ cbz x1, .Lcptr_nvhe_\@
+
+ // VHE case
+ orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN)
+ b .Lset_cptr_\@
+
+.Lcptr_nvhe_\@: // nVHE case
bic x0, x0, #CPTR_EL2_TZ
+.Lset_cptr_\@:
msr cptr_el2, x0
isb
mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index c6e12e8f2751..58e5eb27da68 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -19,6 +19,7 @@
#define HCR_ATA_SHIFT 56
#define HCR_ATA (UL(1) << HCR_ATA_SHIFT)
#define HCR_AMVOFFEN (UL(1) << 51)
+#define HCR_TID4 (UL(1) << 49)
#define HCR_FIEN (UL(1) << 47)
#define HCR_FWB (UL(1) << 46)
#define HCR_API (UL(1) << 41)
@@ -87,7 +88,7 @@
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
HCR_BSU_IS | HCR_FB | HCR_TACR | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
- HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID2)
+ HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3)
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
@@ -289,7 +290,6 @@
#define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT)
#define CPTR_EL2_TZ (1 << 8)
#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */
-#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1
#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \
GENMASK(29, 21) | \
GENMASK(19, 14) | \
@@ -351,8 +351,7 @@
ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
ECN(BKPT32), ECN(VECTOR32), ECN(BRK64), ECN(ERET)
-#define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\
- CPACR_EL1_ZEN_EL1EN)
+#define CPACR_EL1_TTA (1 << 28)
#define kvm_mode_names \
{ PSR_MODE_EL0t, "EL0t" }, \
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 86042afa86c3..7d170aaa2db4 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -68,6 +68,7 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
+ __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
@@ -225,6 +226,9 @@ extern void __kvm_flush_vm_context(void);
extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
int level);
+extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa,
+ int level);
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index b31b32ecbe2d..efc0b45d79c3 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -62,19 +62,14 @@ static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
#else
static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
-
- WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED,
- &kvm->arch.flags));
-
- return test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags);
+ return test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features);
}
#endif
static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
- if (is_kernel_in_hyp_mode())
+ if (has_vhe() || has_hvhe())
vcpu->arch.hcr_el2 |= HCR_E2H;
if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) {
/* route synchronous external abort exceptions to EL2 */
@@ -95,6 +90,12 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
vcpu->arch.hcr_el2 |= HCR_TVM;
}
+ if (cpus_have_final_cap(ARM64_HAS_EVT) &&
+ !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE))
+ vcpu->arch.hcr_el2 |= HCR_TID4;
+ else
+ vcpu->arch.hcr_el2 |= HCR_TID2;
+
if (vcpu_el1_is_32bit(vcpu))
vcpu->arch.hcr_el2 &= ~HCR_RW;
@@ -570,4 +571,35 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature)
return test_bit(feature, vcpu->arch.features);
}
+static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ if (has_vhe()) {
+ val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
+ CPACR_EL1_ZEN_EL1EN);
+ } else if (has_hvhe()) {
+ val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN);
+ } else {
+ val = CPTR_NVHE_EL2_RES1;
+
+ if (vcpu_has_sve(vcpu) &&
+ (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
+ val |= CPTR_EL2_TZ;
+ if (cpus_have_final_cap(ARM64_SME))
+ val &= ~CPTR_EL2_TSM;
+ }
+
+ return val;
+}
+
+static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu)
+{
+ u64 val = kvm_get_reset_cptr_el2(vcpu);
+
+ if (has_vhe() || has_hvhe())
+ write_sysreg(val, cpacr_el1);
+ else
+ write_sysreg(val, cptr_el2);
+}
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d48609d95423..8b6096753740 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -39,6 +39,7 @@
#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
#define KVM_VCPU_MAX_FEATURES 7
+#define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1)
#define KVM_REQ_SLEEP \
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
@@ -159,6 +160,21 @@ struct kvm_s2_mmu {
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
+#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0
+ /*
+ * Memory cache used to split
+ * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It
+ * is used to allocate stage2 page tables while splitting huge
+ * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
+ * influences both the capacity of the split page cache, and
+ * how often KVM reschedules. Be wary of raising CHUNK_SIZE
+ * too high.
+ *
+ * Protected by kvm->slots_lock.
+ */
+ struct kvm_mmu_memory_cache split_page_cache;
+ uint64_t split_page_chunk_size;
+
struct kvm_arch *arch;
};
@@ -214,25 +230,23 @@ struct kvm_arch {
#define KVM_ARCH_FLAG_MTE_ENABLED 1
/* At least one vCPU has ran in the VM */
#define KVM_ARCH_FLAG_HAS_RAN_ONCE 2
- /*
- * The following two bits are used to indicate the guest's EL1
- * register width configuration. A value of KVM_ARCH_FLAG_EL1_32BIT
- * bit is valid only when KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED is set.
- * Otherwise, the guest's EL1 register width has not yet been
- * determined yet.
- */
-#define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3
-#define KVM_ARCH_FLAG_EL1_32BIT 4
+ /* The vCPU feature set for the VM is configured */
+#define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED 3
/* PSCI SYSTEM_SUSPEND enabled for the guest */
-#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5
+#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 4
/* VM counter offset */
-#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6
+#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 5
/* Timer PPIs made immutable */
-#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 7
+#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6
/* SMCCC filter initialized for the VM */
-#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 8
+#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 7
+ /* Initial ID reg values loaded */
+#define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 8
unsigned long flags;
+ /* VM-wide vCPU feature set */
+ DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES);
+
/*
* VM-wide PMU filter, implemented as a bitmap and big enough for
* up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
@@ -242,18 +256,24 @@ struct kvm_arch {
cpumask_var_t supported_cpus;
- u8 pfr0_csv2;
- u8 pfr0_csv3;
- struct {
- u8 imp:4;
- u8 unimp:4;
- } dfr0_pmuver;
-
/* Hypercall features firmware registers' descriptor */
struct kvm_smccc_features smccc_feat;
struct maple_tree smccc_filter;
/*
+ * Emulated CPU ID registers per VM
+ * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it
+ * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8.
+ *
+ * These emulated idregs are VM-wide, but accessed from the context of a vCPU.
+ * Atomic access to multiple idregs are guarded by kvm_arch.config_lock.
+ */
+#define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id))
+#define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)])
+#define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1)
+ u64 id_regs[KVM_ARM_ID_REG_NUM];
+
+ /*
* For an untrusted host VM, 'pkvm.handle' is used to lookup
* the associated pKVM instance in the hypervisor.
*/
@@ -410,6 +430,7 @@ struct kvm_host_data {
struct kvm_host_psci_config {
/* PSCI version used by host. */
u32 version;
+ u32 smccc_version;
/* Function IDs used by host if version is v0.1. */
struct psci_0_1_function_ids function_ids_0_1;
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index bdd9cf546d95..b7238c72a04c 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -16,12 +16,35 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
+/*
+ * Unified accessors for registers that have a different encoding
+ * between VHE and non-VHE. They must be specified without their "ELx"
+ * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h.
+ */
+
+#if defined(__KVM_VHE_HYPERVISOR__)
+
+#define read_sysreg_el0(r) read_sysreg_s(r##_EL02)
+#define write_sysreg_el0(v,r) write_sysreg_s(v, r##_EL02)
+#define read_sysreg_el1(r) read_sysreg_s(r##_EL12)
+#define write_sysreg_el1(v,r) write_sysreg_s(v, r##_EL12)
+#define read_sysreg_el2(r) read_sysreg_s(r##_EL1)
+#define write_sysreg_el2(v,r) write_sysreg_s(v, r##_EL1)
+
+#else // !__KVM_VHE_HYPERVISOR__
+
+#if defined(__KVM_NVHE_HYPERVISOR__)
+#define VHE_ALT_KEY ARM64_KVM_HVHE
+#else
+#define VHE_ALT_KEY ARM64_HAS_VIRT_HOST_EXTN
+#endif
+
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
- asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \
+ asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \
__mrs_s("%0", r##vh), \
- ARM64_HAS_VIRT_HOST_EXTN) \
+ VHE_ALT_KEY) \
: "=r" (reg)); \
reg; \
})
@@ -31,16 +54,10 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
u64 __val = (u64)(v); \
asm volatile(ALTERNATIVE(__msr_s(r##nvh, "%x0"), \
__msr_s(r##vh, "%x0"), \
- ARM64_HAS_VIRT_HOST_EXTN) \
+ VHE_ALT_KEY) \
: : "rZ" (__val)); \
} while (0)
-/*
- * Unified accessors for registers that have a different encoding
- * between VHE and non-VHE. They must be specified without their "ELx"
- * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h.
- */
-
#define read_sysreg_el0(r) read_sysreg_elx(r, _EL0, _EL02)
#define write_sysreg_el0(v,r) write_sysreg_elx(v, r, _EL0, _EL02)
#define read_sysreg_el1(r) read_sysreg_elx(r, _EL1, _EL12)
@@ -48,6 +65,8 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
#define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1)
#define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1)
+#endif // __KVM_VHE_HYPERVISOR__
+
/*
* Without an __arch_swab32(), we fall back to ___constant_swab32(), but the
* static inline can allow the compiler to out-of-line this. KVM always wants
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 27e63c111f78..0e1e1ab17b4d 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -172,6 +172,7 @@ void __init free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
+void kvm_uninit_stage2_mmu(struct kvm *kvm);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable);
@@ -227,7 +228,8 @@ static inline void __invalidate_icache_guest_page(void *va, size_t size)
if (icache_is_aliasing()) {
/* any kind of VIPT cache */
icache_inval_all_pou();
- } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
+ } else if (read_sysreg(CurrentEL) != CurrentEL_EL1 ||
+ !icache_is_vpipt()) {
/* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
icache_inval_pou((unsigned long)va, (unsigned long)va + size);
}
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 93bd0975b15f..8294a9a7e566 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL;
}
+static inline u32 kvm_supported_block_sizes(void)
+{
+ u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL;
+ u32 r = 0;
+
+ for (; level < KVM_PGTABLE_MAX_LEVELS; level++)
+ r |= BIT(kvm_granule_shift(level));
+
+ return r;
+}
+
+static inline bool kvm_is_block_size_supported(u64 size)
+{
+ bool is_power_of_two = IS_ALIGNED(size, size);
+
+ return is_power_of_two && (size & kvm_supported_block_sizes());
+}
+
/**
* struct kvm_pgtable_mm_ops - Memory management callbacks.
* @zalloc_page: Allocate a single zeroed memory page.
@@ -104,7 +122,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
* allocation is physically contiguous.
* @free_pages_exact: Free an exact number of memory pages previously
* allocated by zalloc_pages_exact.
- * @free_removed_table: Free a removed paging structure by unlinking and
+ * @free_unlinked_table: Free an unlinked paging structure by unlinking and
* dropping references.
* @get_page: Increment the refcount on a page.
* @put_page: Decrement the refcount on a page. When the
@@ -124,7 +142,7 @@ struct kvm_pgtable_mm_ops {
void* (*zalloc_page)(void *arg);
void* (*zalloc_pages_exact)(size_t size);
void (*free_pages_exact)(void *addr, size_t size);
- void (*free_removed_table)(void *addr, u32 level);
+ void (*free_unlinked_table)(void *addr, u32 level);
void (*get_page)(void *addr);
void (*put_page)(void *addr);
int (*page_count)(void *addr);
@@ -195,6 +213,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
* with other software walkers.
* @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was
* invoked from a fault handler.
+ * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries
+ * without Break-before-make's
+ * TLB invalidation.
+ * @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries
+ * without Cache maintenance
+ * operations required.
*/
enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_LEAF = BIT(0),
@@ -202,6 +226,8 @@ enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
KVM_PGTABLE_WALK_SHARED = BIT(3),
KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4),
+ KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5),
+ KVM_PGTABLE_WALK_SKIP_CMO = BIT(6),
};
struct kvm_pgtable_visit_ctx {
@@ -441,7 +467,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
/**
- * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
+ * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
* @mm_ops: Memory management callbacks.
* @pgtable: Unlinked stage-2 paging structure to be freed.
* @level: Level of the stage-2 paging structure to be freed.
@@ -449,7 +475,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
* The page-table is assumed to be unreachable by any hardware walkers prior to
* freeing and therefore no TLB invalidation is performed.
*/
-void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
+void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
+
+/**
+ * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
+ * @phys: Physical address of the memory to map.
+ * @level: Starting level of the stage-2 paging structure to be created.
+ * @prot: Permissions and attributes for the mapping.
+ * @mc: Cache of pre-allocated and zeroed memory from which to allocate
+ * page-table pages.
+ * @force_pte: Force mappings to PAGE_SIZE granularity.
+ *
+ * Returns an unlinked page-table tree. This new page-table tree is
+ * not reachable (i.e., it is unlinked) from the root pgd and it's
+ * therefore unreachableby the hardware page-table walker. No TLB
+ * invalidation or CMOs are performed.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable.
+ *
+ * Return: The fully populated (unlinked) stage-2 paging structure, or
+ * an ERR_PTR(error) on failure.
+ */
+kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
+ u64 phys, u32 level,
+ enum kvm_pgtable_prot prot,
+ void *mc, bool force_pte);
/**
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
@@ -621,6 +673,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
/**
+ * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
+ * to PAGE_SIZE guest pages.
+ * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr: Intermediate physical address from which to split.
+ * @size: Size of the range.
+ * @mc: Cache of pre-allocated and zeroed memory from which to allocate
+ * page-table pages.
+ *
+ * The function tries to split any level 1 or 2 entry that overlaps
+ * with the input range (given by @addr and @size).
+ *
+ * Return: 0 on success, negative error code on failure. Note that
+ * kvm_pgtable_stage2_split() is best effort: it tries to break as many
+ * blocks in the input range as allowed by @mc_capacity.
+ */
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_mmu_memory_cache *mc);
+
+/**
* kvm_pgtable_walk() - Walk a page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_*_init().
* @addr: Input address for the start of the walk.
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 01129b0d4c68..e46250a02017 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -6,7 +6,9 @@
#ifndef __ARM64_KVM_PKVM_H__
#define __ARM64_KVM_PKVM_H__
+#include <linux/arm_ffa.h>
#include <linux/memblock.h>
+#include <linux/scatterlist.h>
#include <asm/kvm_pgtable.h>
/* Maximum number of VMs that can co-exist under pKVM. */
@@ -106,4 +108,23 @@ static inline unsigned long host_s2_pgtable_pages(void)
return res;
}
+#define KVM_FFA_MBOX_NR_PAGES 1
+
+static inline unsigned long hyp_ffa_proxy_pages(void)
+{
+ size_t desc_max;
+
+ /*
+ * The hypervisor FFA proxy needs enough memory to buffer a fragmented
+ * descriptor returned from EL3 in response to a RETRIEVE_REQ call.
+ */
+ desc_max = sizeof(struct ffa_mem_region) +
+ sizeof(struct ffa_mem_region_attributes) +
+ sizeof(struct ffa_composite_mem_region) +
+ SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range);
+
+ /* Plus a page each for the hypervisor's RX and TX mailboxes. */
+ return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE);
+}
+
#endif /* __ARM64_KVM_PKVM_H__ */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 7a1e62631814..b481935e9314 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -510,6 +510,7 @@
(BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
(BIT(29)))
+#define SCTLR_EL2_BT (BIT(36))
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ENDIAN_SET_EL2 SCTLR_ELx_EE
#else
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 4eb601e7de50..5227db7640c8 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -110,8 +110,10 @@ static inline bool is_hyp_mode_mismatched(void)
return __boot_cpu_mode[0] != __boot_cpu_mode[1];
}
-static inline bool is_kernel_in_hyp_mode(void)
+static __always_inline bool is_kernel_in_hyp_mode(void)
{
+ BUILD_BUG_ON(__is_defined(__KVM_NVHE_HYPERVISOR__) ||
+ __is_defined(__KVM_VHE_HYPERVISOR__));
return read_sysreg(CurrentEL) == CurrentEL_EL2;
}
@@ -140,6 +142,14 @@ static __always_inline bool is_protected_kvm_enabled(void)
return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE);
}
+static __always_inline bool has_hvhe(void)
+{
+ if (is_vhe_hyp_code())
+ return false;
+
+ return cpus_have_final_cap(ARM64_KVM_HVHE);
+}
+
static inline bool is_hyp_nvhe(void)
{
return is_hyp_mode_available() && !is_kernel_in_hyp_mode();
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 307faa2b4395..be66e94a21bd 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -730,6 +730,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
.cpu_enable = cpu_clear_bf16_from_user_emulation,
},
#endif
+#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38
+ {
+ .desc = "AmpereOne erratum AC03_CPU_38",
+ .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
+ ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
+ },
+#endif
{
}
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6ea7f23b1287..f9d456fe132d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -672,6 +672,8 @@ struct arm64_ftr_override __ro_after_init id_aa64smfr0_override;
struct arm64_ftr_override __ro_after_init id_aa64isar1_override;
struct arm64_ftr_override __ro_after_init id_aa64isar2_override;
+struct arm64_ftr_override arm64_sw_feature_override;
+
static const struct __ftr_reg_entry {
u32 sys_id;
struct arm64_ftr_reg *reg;
@@ -807,7 +809,7 @@ static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg,
return reg;
}
-static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
+s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
s64 cur)
{
s64 ret = 0;
@@ -2009,6 +2011,19 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap,
return true;
}
+static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
+ int __unused)
+{
+ u64 val;
+
+ val = read_sysreg(id_aa64mmfr1_el1);
+ if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT))
+ return false;
+
+ val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask;
+ return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE);
+}
+
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
@@ -2683,6 +2698,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_cpuid_feature,
ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1PIE, IMP)
},
+ {
+ .desc = "VHE for hypervisor only",
+ .capability = ARM64_KVM_HVHE,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = hvhe_possible,
+ },
+ {
+ .desc = "Enhanced Virtualization Traps",
+ .capability = ARM64_HAS_EVT,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .sys_reg = SYS_ID_AA64MMFR2_EL1,
+ .sign = FTR_UNSIGNED,
+ .field_pos = ID_AA64MMFR2_EL1_EVT_SHIFT,
+ .field_width = 4,
+ .min_field_value = ID_AA64MMFR2_EL1_EVT_IMP,
+ .matches = has_cpuid_feature,
+ },
{},
};
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 0f5a30f109d9..757a0de07f91 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -603,6 +603,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
msr sctlr_el1, x1
mov x2, xzr
2:
+ __init_el2_nvhe_prepare_eret
+
mov w0, #BOOT_CPU_MODE_EL2
orr x0, x0, x2
eret
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index d63de1973ddb..65f76064c86b 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -82,7 +82,15 @@ SYM_CODE_START_LOCAL(__finalise_el2)
tbnz x1, #0, 1f
// Needs to be VHE capable, obviously
- check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f x1 x2
+ check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 0f 1f x1 x2
+
+0: // Check whether we only want the hypervisor to run VHE, not the kernel
+ adr_l x1, arm64_sw_feature_override
+ ldr x2, [x1, FTR_OVR_VAL_OFFSET]
+ ldr x1, [x1, FTR_OVR_MASK_OFFSET]
+ and x2, x2, x1
+ ubfx x2, x2, #ARM64_SW_FEATURE_OVERRIDE_HVHE, #4
+ cbz x2, 2f
1: mov_q x0, HVC_STUB_ERR
eret
diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c
index 8439248c21d3..2fe2491b692c 100644
--- a/arch/arm64/kernel/idreg-override.c
+++ b/arch/arm64/kernel/idreg-override.c
@@ -139,15 +139,22 @@ static const struct ftr_set_desc smfr0 __initconst = {
},
};
-extern struct arm64_ftr_override kaslr_feature_override;
+static bool __init hvhe_filter(u64 val)
+{
+ u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1);
+
+ return (val == 1 &&
+ lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 &&
+ cpuid_feature_extract_unsigned_field(mmfr1,
+ ID_AA64MMFR1_EL1_VH_SHIFT));
+}
-static const struct ftr_set_desc kaslr __initconst = {
- .name = "kaslr",
-#ifdef CONFIG_RANDOMIZE_BASE
- .override = &kaslr_feature_override,
-#endif
+static const struct ftr_set_desc sw_features __initconst = {
+ .name = "arm64_sw",
+ .override = &arm64_sw_feature_override,
.fields = {
- FIELD("disabled", 0, NULL),
+ FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL),
+ FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter),
{}
},
};
@@ -159,7 +166,7 @@ static const struct ftr_set_desc * const regs[] __initconst = {
&isar1,
&isar2,
&smfr0,
- &kaslr,
+ &sw_features,
};
static const struct {
@@ -177,7 +184,7 @@ static const struct {
"id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" },
{ "arm64.nomops", "id_aa64isar2.mops=0" },
{ "arm64.nomte", "id_aa64pfr1.mte=0" },
- { "nokaslr", "kaslr.disabled=1" },
+ { "nokaslr", "arm64_sw.nokaslr=1" },
};
static int __init parse_nokaslr(char *unused)
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 17f96a19781d..94a269cd1f07 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -12,13 +12,13 @@
u16 __initdata memstart_offset_seed;
-struct arm64_ftr_override kaslr_feature_override __initdata;
-
bool __ro_after_init __kaslr_is_enabled = false;
void __init kaslr_init(void)
{
- if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) {
+ if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val &
+ arm64_sw_feature_override.mask,
+ ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) {
pr_info("KASLR disabled on command line\n");
return;
}
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 05b022be885b..0696732fa38c 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1406,7 +1406,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
- goto out_free_irq;
+ goto out_free_vtimer_irq;
}
static_branch_enable(&has_gic_active_state);
@@ -1422,7 +1422,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
if (err) {
kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
host_ptimer_irq, err);
- return err;
+ goto out_free_vtimer_irq;
}
if (has_gic) {
@@ -1430,7 +1430,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
- goto out_free_irq;
+ goto out_free_ptimer_irq;
}
}
@@ -1439,11 +1439,15 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
info->physical_irq);
err = -ENODEV;
- goto out_free_irq;
+ goto out_free_vtimer_irq;
}
return 0;
-out_free_irq:
+
+out_free_ptimer_irq:
+ if (info->physical_irq > 0)
+ free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus());
+out_free_vtimer_irq:
free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
return err;
}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 14391826241c..c2c14059f6a8 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -51,6 +51,8 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
+DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+
static bool vgic_present;
static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
@@ -65,6 +67,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
int r;
+ u64 new_cap;
if (cap->flags)
return -EINVAL;
@@ -89,6 +92,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ new_cap = cap->args[0];
+
+ mutex_lock(&kvm->slots_lock);
+ /*
+ * To keep things simple, allow changing the chunk
+ * size only when no memory slots have been created.
+ */
+ if (!kvm_are_all_memslots_empty(kvm)) {
+ r = -EINVAL;
+ } else if (new_cap && !kvm_is_block_size_supported(new_cap)) {
+ r = -EINVAL;
+ } else {
+ r = 0;
+ kvm->arch.mmu.split_page_chunk_size = new_cap;
+ }
+ mutex_unlock(&kvm->slots_lock);
+ break;
default:
r = -EINVAL;
break;
@@ -102,22 +123,6 @@ static int kvm_arm_default_max_vcpus(void)
return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}
-static void set_default_spectre(struct kvm *kvm)
-{
- /*
- * The default is to expose CSV2 == 1 if the HW isn't affected.
- * Although this is a per-CPU feature, we make it global because
- * asymmetric systems are just a nuisance.
- *
- * Userspace can override this as long as it doesn't promise
- * the impossible.
- */
- if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
- kvm->arch.pfr0_csv2 = 1;
- if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
- kvm->arch.pfr0_csv3 = 1;
-}
-
/**
* kvm_arch_init_vm - initializes a VM data structure
* @kvm: pointer to the KVM struct
@@ -161,14 +166,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->max_vcpus = kvm_arm_default_max_vcpus();
- set_default_spectre(kvm);
kvm_arm_init_hypercalls(kvm);
- /*
- * Initialise the default PMUver before there is a chance to
- * create an actual PMU.
- */
- kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();
+ bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
return 0;
@@ -302,6 +302,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PTRAUTH_GENERIC:
r = system_has_full_ptr_auth();
break;
+ case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
+ if (kvm)
+ r = kvm->arch.mmu.split_page_chunk_size;
+ else
+ r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ break;
+ case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
+ r = kvm_supported_block_sizes();
+ break;
default:
r = 0;
}
@@ -1167,58 +1176,115 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
return -EINVAL;
}
-static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
- const struct kvm_vcpu_init *init)
+static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
{
- unsigned int i, ret;
- u32 phys_target = kvm_target_cpu();
+ unsigned long features = init->features[0];
+ int i;
+
+ if (features & ~KVM_VCPU_VALID_FEATURES)
+ return -ENOENT;
+
+ for (i = 1; i < ARRAY_SIZE(init->features); i++) {
+ if (init->features[i])
+ return -ENOENT;
+ }
+
+ if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
+ return 0;
- if (init->target != phys_target)
+ if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1))
return -EINVAL;
- /*
- * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
- * use the same target.
- */
- if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+ /* MTE is incompatible with AArch32 */
+ if (kvm_has_mte(vcpu->kvm))
return -EINVAL;
- /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
- for (i = 0; i < sizeof(init->features) * 8; i++) {
- bool set = (init->features[i / 32] & (1 << (i % 32)));
+ /* NV is incompatible with AArch32 */
+ if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
+ return -EINVAL;
- if (set && i >= KVM_VCPU_MAX_FEATURES)
- return -ENOENT;
+ return 0;
+}
- /*
- * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
- * use the same feature set.
- */
- if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
- test_bit(i, vcpu->arch.features) != set)
- return -EINVAL;
+static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ unsigned long features = init->features[0];
- if (set)
- set_bit(i, vcpu->arch.features);
- }
+ return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES) ||
+ vcpu->arch.target != init->target;
+}
+
+static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ unsigned long features = init->features[0];
+ struct kvm *kvm = vcpu->kvm;
+ int ret = -EINVAL;
+
+ mutex_lock(&kvm->arch.config_lock);
- vcpu->arch.target = phys_target;
+ if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
+ !bitmap_equal(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES))
+ goto out_unlock;
+
+ vcpu->arch.target = init->target;
+ bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES);
/* Now we know what it is, we can reset it. */
ret = kvm_reset_vcpu(vcpu);
if (ret) {
vcpu->arch.target = -1;
bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+ goto out_unlock;
}
+ bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
+ set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
+
+out_unlock:
+ mutex_unlock(&kvm->arch.config_lock);
return ret;
}
+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ int ret;
+
+ if (init->target != kvm_target_cpu())
+ return -EINVAL;
+
+ ret = kvm_vcpu_init_check_features(vcpu, init);
+ if (ret)
+ return ret;
+
+ if (vcpu->arch.target == -1)
+ return __kvm_vcpu_set_target(vcpu, init);
+
+ if (kvm_vcpu_init_changed(vcpu, init))
+ return -EINVAL;
+
+ return kvm_reset_vcpu(vcpu);
+}
+
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
struct kvm_vcpu_init *init)
{
+ bool power_off = false;
int ret;
+ /*
+ * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
+ * reflecting it in the finalized feature set, thus limiting its scope
+ * to a single KVM_ARM_VCPU_INIT call.
+ */
+ if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
+ init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
+ power_off = true;
+ }
+
ret = kvm_vcpu_set_target(vcpu, init);
if (ret)
return ret;
@@ -1240,14 +1306,14 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
}
vcpu_reset_hcr(vcpu);
- vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
+ vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
/*
* Handle the "start in power-off" case.
*/
spin_lock(&vcpu->arch.mp_state_lock);
- if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
+ if (power_off)
__kvm_arm_vcpu_power_off(vcpu);
else
WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
@@ -1666,7 +1732,13 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->mair_el2 = read_sysreg(mair_el1);
- tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
+ tcr = read_sysreg(tcr_el1);
+ if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
+ tcr |= TCR_EPD1_MASK;
+ } else {
+ tcr &= TCR_EL2_MASK;
+ tcr |= TCR_EL2_RES1;
+ }
tcr &= ~TCR_T0SZ_MASK;
tcr |= TCR_T0SZ(hyp_va_bits);
params->tcr_el2 = tcr;
@@ -1676,6 +1748,8 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
else
params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
+ if (cpus_have_final_cap(ARM64_KVM_HVHE))
+ params->hcr_el2 |= HCR_E2H;
params->vttbr = params->vtcr = 0;
/*
@@ -1910,6 +1984,7 @@ static bool __init init_psci_relay(void)
}
kvm_host_psci_config.version = psci_ops.get_version();
+ kvm_host_psci_config.smccc_version = arm_smccc_get_version();
if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
@@ -2067,6 +2142,26 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
return 0;
}
+static void pkvm_hyp_init_ptrauth(void)
+{
+ struct kvm_cpu_context *hyp_ctxt;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
+ hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
+ hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
+ }
+}
+
/* Inits Hyp-mode on all online CPUs */
static int __init init_hyp_mode(void)
{
@@ -2228,6 +2323,10 @@ static int __init init_hyp_mode(void)
kvm_hyp_init_symbols();
if (is_protected_kvm_enabled()) {
+ if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
+ cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH))
+ pkvm_hyp_init_ptrauth();
+
init_cpu_logical_map();
if (!init_psci_relay()) {
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 4c9dcd8fc939..8c1d0d4853df 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -180,7 +180,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
/*
* If we have VHE then the Hyp code will reset CPACR_EL1 to
- * CPACR_EL1_DEFAULT and we need to reenable SME.
+ * the default value and we need to reenable SME.
*/
if (has_vhe() && system_supports_sme()) {
/* Also restore EL0 state seen on entry */
@@ -210,7 +210,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
/*
* The FPSIMD/SVE state in the CPU has not been touched, and we
* have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
- * reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE
+ * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE
* for EL0. To avoid spurious traps, restore the trap state
* seen by kvm_arch_vcpu_load_fp():
*/
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 2f6e0b3e4a75..4bddb8541bec 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -70,6 +70,56 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
+static inline bool __hfgxtr_traps_required(void)
+{
+ if (cpus_have_final_cap(ARM64_SME))
+ return true;
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ return true;
+
+ return false;
+}
+
+static inline void __activate_traps_hfgxtr(void)
+{
+ u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
+
+ if (cpus_have_final_cap(ARM64_SME)) {
+ tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
+
+ r_clr |= tmp;
+ w_clr |= tmp;
+ }
+
+ /*
+ * Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD.
+ */
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ w_set |= HFGxTR_EL2_TCR_EL1_MASK;
+
+ sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
+ sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
+}
+
+static inline void __deactivate_traps_hfgxtr(void)
+{
+ u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
+
+ if (cpus_have_final_cap(ARM64_SME)) {
+ tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
+
+ r_set |= tmp;
+ w_set |= tmp;
+ }
+
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ w_clr |= HFGxTR_EL2_TCR_EL1_MASK;
+
+ sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
+ sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
+}
+
static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
{
/* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
@@ -95,16 +145,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
- if (cpus_have_final_cap(ARM64_SME)) {
- sysreg_clear_set_s(SYS_HFGRTR_EL2,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK,
- 0);
- sysreg_clear_set_s(SYS_HFGWTR_EL2,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK,
- 0);
- }
+ if (__hfgxtr_traps_required())
+ __activate_traps_hfgxtr();
}
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
@@ -120,14 +162,8 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
}
- if (cpus_have_final_cap(ARM64_SME)) {
- sysreg_clear_set_s(SYS_HFGRTR_EL2, 0,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK);
- sysreg_clear_set_s(SYS_HFGWTR_EL2, 0,
- HFGxTR_EL2_nSMPRI_EL1_MASK |
- HFGxTR_EL2_nTPIDR2_EL0_MASK);
- }
+ if (__hfgxtr_traps_required())
+ __deactivate_traps_hfgxtr();
}
static inline void ___activate_traps(struct kvm_vcpu *vcpu)
@@ -209,7 +245,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
- if (has_vhe()) {
+ if (has_vhe() || has_hvhe()) {
reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
if (sve_guest)
reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
@@ -401,12 +437,39 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
return true;
}
+static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
+{
+ u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
+ int rt = kvm_vcpu_sys_get_rt(vcpu);
+ u64 val = vcpu_get_reg(vcpu, rt);
+
+ if (sysreg != SYS_TCR_EL1)
+ return false;
+
+ /*
+ * Affected parts do not advertise support for hardware Access Flag /
+ * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying
+ * control bits are still functional. The architecture requires these be
+ * RES0 on systems that do not implement FEAT_HAFDBS.
+ *
+ * Uphold the requirements of the architecture by masking guest writes
+ * to TCR_EL1.{HA,HD} here.
+ */
+ val &= ~(TCR_HD | TCR_HA);
+ write_sysreg_el1(val, SYS_TCR);
+ return true;
+}
+
static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
handle_tx2_tvm(vcpu))
return true;
+ if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) &&
+ handle_ampere1_tcr(vcpu))
+ return true;
+
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
__vgic_v3_perform_cpuif_access(vcpu) == 1)
return true;
diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
new file mode 100644
index 000000000000..1becb10ecd80
--- /dev/null
+++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 - Google LLC
+ * Author: Andrew Walbran <qwandor@google.com>
+ */
+#ifndef __KVM_HYP_FFA_H
+#define __KVM_HYP_FFA_H
+
+#include <asm/kvm_host.h>
+
+#define FFA_MIN_FUNC_NUM 0x60
+#define FFA_MAX_FUNC_NUM 0x7F
+
+int hyp_ffa_init(void *pages);
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt);
+
+#endif /* __KVM_HYP_FFA_H */
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index b7bdbe63deed..0972faccc2af 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -57,6 +57,7 @@ extern struct host_mmu host_mmu;
enum pkvm_component_id {
PKVM_ID_HOST,
PKVM_ID_HYP,
+ PKVM_ID_FFA,
};
extern unsigned long hyp_nr_cpus;
@@ -66,6 +67,8 @@ int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index 530347cdebe3..9ddc025e4b86 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -22,7 +22,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs))
hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
- cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o
+ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
new file mode 100644
index 000000000000..58dcd92bf346
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by
+ * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware
+ * Framework for Arm A-profile", which is specified by Arm in document
+ * number DEN0077.
+ *
+ * Copyright (C) 2022 - Google LLC
+ * Author: Andrew Walbran <qwandor@google.com>
+ *
+ * This driver hooks into the SMC trapping logic for the host and intercepts
+ * all calls falling within the FF-A range. Each call is either:
+ *
+ * - Forwarded on unmodified to the SPMD at EL3
+ * - Rejected as "unsupported"
+ * - Accompanied by a host stage-2 page-table check/update and reissued
+ *
+ * Consequently, any attempts by the host to make guest memory pages
+ * accessible to the secure world using FF-A will be detected either here
+ * (in the case that the memory is already owned by the guest) or during
+ * donation to the guest (in the case that the memory was previously shared
+ * with the secure world).
+ *
+ * To allow the rolling-back of page-table updates and FF-A calls in the
+ * event of failure, operations involving the RXTX buffers are locked for
+ * the duration and are therefore serialised.
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/arm_ffa.h>
+#include <asm/kvm_pkvm.h>
+
+#include <nvhe/ffa.h>
+#include <nvhe/mem_protect.h>
+#include <nvhe/memory.h>
+#include <nvhe/trap_handler.h>
+#include <nvhe/spinlock.h>
+
+/*
+ * "ID value 0 must be returned at the Non-secure physical FF-A instance"
+ * We share this ID with the host.
+ */
+#define HOST_FFA_ID 0
+
+/*
+ * A buffer to hold the maximum descriptor size we can see from the host,
+ * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP
+ * when resolving the handle on the reclaim path.
+ */
+struct kvm_ffa_descriptor_buffer {
+ void *buf;
+ size_t len;
+};
+
+static struct kvm_ffa_descriptor_buffer ffa_desc_buf;
+
+struct kvm_ffa_buffers {
+ hyp_spinlock_t lock;
+ void *tx;
+ void *rx;
+};
+
+/*
+ * Note that we don't currently lock these buffers explicitly, instead
+ * relying on the locking of the host FFA buffers as we only have one
+ * client.
+ */
+static struct kvm_ffa_buffers hyp_buffers;
+static struct kvm_ffa_buffers host_buffers;
+
+static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno)
+{
+ *res = (struct arm_smccc_res) {
+ .a0 = FFA_ERROR,
+ .a2 = ffa_errno,
+ };
+}
+
+static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop)
+{
+ if (ret == FFA_RET_SUCCESS) {
+ *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS,
+ .a2 = prop };
+ } else {
+ ffa_to_smccc_error(res, ret);
+ }
+}
+
+static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret)
+{
+ ffa_to_smccc_res_prop(res, ret, 0);
+}
+
+static void ffa_set_retval(struct kvm_cpu_context *ctxt,
+ struct arm_smccc_res *res)
+{
+ cpu_reg(ctxt, 0) = res->a0;
+ cpu_reg(ctxt, 1) = res->a1;
+ cpu_reg(ctxt, 2) = res->a2;
+ cpu_reg(ctxt, 3) = res->a3;
+}
+
+static bool is_ffa_call(u64 func_id)
+{
+ return ARM_SMCCC_IS_FAST_CALL(func_id) &&
+ ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
+ ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM &&
+ ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM;
+}
+
+static int ffa_map_hyp_buffers(u64 ffa_page_count)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP,
+ hyp_virt_to_phys(hyp_buffers.tx),
+ hyp_virt_to_phys(hyp_buffers.rx),
+ ffa_page_count,
+ 0, 0, 0, 0,
+ &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static int ffa_unmap_hyp_buffers(void)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_smc(FFA_RXTX_UNMAP,
+ HOST_FFA_ID,
+ 0, 0, 0, 0, 0, 0,
+ &res);
+
+ return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2;
+}
+
+static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 fraglen, u32 endpoint_id)
+{
+ arm_smccc_1_1_smc(FFA_MEM_FRAG_TX,
+ handle_lo, handle_hi, fraglen, endpoint_id,
+ 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 fragoff)
+{
+ arm_smccc_1_1_smc(FFA_MEM_FRAG_RX,
+ handle_lo, handle_hi, fragoff, HOST_FFA_ID,
+ 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len,
+ u32 fraglen)
+{
+ arm_smccc_1_1_smc(func_id, len, fraglen,
+ 0, 0, 0, 0, 0,
+ res);
+}
+
+static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo,
+ u32 handle_hi, u32 flags)
+{
+ arm_smccc_1_1_smc(FFA_MEM_RECLAIM,
+ handle_lo, handle_hi, flags,
+ 0, 0, 0, 0,
+ res);
+}
+
+static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len)
+{
+ arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ,
+ len, len,
+ 0, 0, 0, 0, 0,
+ res);
+}
+
+static void do_ffa_rxtx_map(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(phys_addr_t, tx, ctxt, 1);
+ DECLARE_REG(phys_addr_t, rx, ctxt, 2);
+ DECLARE_REG(u32, npages, ctxt, 3);
+ int ret = 0;
+ void *rx_virt, *tx_virt;
+
+ if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (host_buffers.tx) {
+ ret = FFA_RET_DENIED;
+ goto out_unlock;
+ }
+
+ /*
+ * Map our hypervisor buffers into the SPMD before mapping and
+ * pinning the host buffers in our own address space.
+ */
+ ret = ffa_map_hyp_buffers(npages);
+ if (ret)
+ goto out_unlock;
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unmap;
+ }
+
+ ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx));
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_tx;
+ }
+
+ tx_virt = hyp_phys_to_virt(tx);
+ ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unshare_rx;
+ }
+
+ rx_virt = hyp_phys_to_virt(rx);
+ ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1);
+ if (ret) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto err_unpin_tx;
+ }
+
+ host_buffers.tx = tx_virt;
+ host_buffers.rx = rx_virt;
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unpin_tx:
+ hyp_unpin_shared_mem(tx_virt, tx_virt + 1);
+err_unshare_rx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx));
+err_unshare_tx:
+ __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx));
+err_unmap:
+ ffa_unmap_hyp_buffers();
+ goto out_unlock;
+}
+
+static void do_ffa_rxtx_unmap(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ int ret = 0;
+
+ if (id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx)));
+ host_buffers.tx = NULL;
+
+ hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1);
+ WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx)));
+ host_buffers.rx = NULL;
+
+ ffa_unmap_hyp_buffers();
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ ffa_to_smccc_res(res, ret);
+}
+
+static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 i;
+
+ for (i = 0; i < nranges; ++i) {
+ struct ffa_mem_region_addr_range *range = &ranges[i];
+ u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE;
+ u64 pfn = hyp_phys_to_pfn(range->address);
+
+ if (!PAGE_ALIGNED(sz))
+ break;
+
+ if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE))
+ break;
+ }
+
+ return i;
+}
+
+static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nshared = __ffa_host_share_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nshared != nranges) {
+ WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges,
+ u32 nranges)
+{
+ u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges);
+ int ret = 0;
+
+ if (nunshared != nranges) {
+ WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared);
+ ret = FFA_RET_DENIED;
+ }
+
+ return ret;
+}
+
+static void do_ffa_mem_frag_tx(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, fraglen, ctxt, 3);
+ DECLARE_REG(u32, endpoint_id, ctxt, 4);
+ struct ffa_mem_region_addr_range *buf;
+ int ret = FFA_RET_INVALID_PARAMETERS;
+ u32 nr_ranges;
+
+ if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)
+ goto out;
+
+ if (fraglen % sizeof(*buf))
+ goto out;
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx)
+ goto out_unlock;
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+ nr_ranges = fraglen / sizeof(*buf);
+
+ ret = ffa_host_share_ranges(buf, nr_ranges);
+ if (ret) {
+ /*
+ * We're effectively aborting the transaction, so we need
+ * to restore the global state back to what it was prior to
+ * transmission of the first fragment.
+ */
+ ffa_mem_reclaim(res, handle_lo, handle_hi, 0);
+ WARN_ON(res->a0 != FFA_SUCCESS);
+ goto out_unlock;
+ }
+
+ ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id);
+ if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX)
+ WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges));
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+
+ /*
+ * If for any reason this did not succeed, we're in trouble as we have
+ * now lost the content of the previous fragments and we can't rollback
+ * the host stage-2 changes. The pages previously marked as shared will
+ * remain stuck in that state forever, hence preventing the host from
+ * sharing/donating them again and may possibly lead to subsequent
+ * failures, but this will not compromise confidentiality.
+ */
+ return;
+}
+
+static __always_inline void do_ffa_mem_xfer(const u64 func_id,
+ struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, len, ctxt, 1);
+ DECLARE_REG(u32, fraglen, ctxt, 2);
+ DECLARE_REG(u64, addr_mbz, ctxt, 3);
+ DECLARE_REG(u32, npages_mbz, ctxt, 4);
+ struct ffa_composite_mem_region *reg;
+ struct ffa_mem_region *buf;
+ u32 offset, nr_ranges;
+ int ret = 0;
+
+ BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE &&
+ func_id != FFA_FN64_MEM_LEND);
+
+ if (addr_mbz || npages_mbz || fraglen > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ if (fraglen < sizeof(struct ffa_mem_region) +
+ sizeof(struct ffa_mem_region_attributes)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out;
+ }
+
+ hyp_spin_lock(&host_buffers.lock);
+ if (!host_buffers.tx) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ buf = hyp_buffers.tx;
+ memcpy(buf, host_buffers.tx, fraglen);
+
+ offset = buf->ep_mem_access[0].composite_off;
+ if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ reg = (void *)buf + offset;
+ nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents;
+ if (nr_ranges % sizeof(reg->constituents[0])) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ nr_ranges /= sizeof(reg->constituents[0]);
+ ret = ffa_host_share_ranges(reg->constituents, nr_ranges);
+ if (ret)
+ goto out_unlock;
+
+ ffa_mem_xfer(res, func_id, len, fraglen);
+ if (fraglen != len) {
+ if (res->a0 != FFA_MEM_FRAG_RX)
+ goto err_unshare;
+
+ if (res->a3 != fraglen)
+ goto err_unshare;
+ } else if (res->a0 != FFA_SUCCESS) {
+ goto err_unshare;
+ }
+
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+out:
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+ return;
+
+err_unshare:
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges));
+ goto out_unlock;
+}
+
+static void do_ffa_mem_reclaim(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, handle_lo, ctxt, 1);
+ DECLARE_REG(u32, handle_hi, ctxt, 2);
+ DECLARE_REG(u32, flags, ctxt, 3);
+ struct ffa_composite_mem_region *reg;
+ u32 offset, len, fraglen, fragoff;
+ struct ffa_mem_region *buf;
+ int ret = 0;
+ u64 handle;
+
+ handle = PACK_HANDLE(handle_lo, handle_hi);
+
+ hyp_spin_lock(&host_buffers.lock);
+
+ buf = hyp_buffers.tx;
+ *buf = (struct ffa_mem_region) {
+ .sender_id = HOST_FFA_ID,
+ .handle = handle,
+ };
+
+ ffa_retrieve_req(res, sizeof(*buf));
+ buf = hyp_buffers.rx;
+ if (res->a0 != FFA_MEM_RETRIEVE_RESP)
+ goto out_unlock;
+
+ len = res->a1;
+ fraglen = res->a2;
+
+ offset = buf->ep_mem_access[0].composite_off;
+ /*
+ * We can trust the SPMD to get this right, but let's at least
+ * check that we end up with something that doesn't look _completely_
+ * bogus.
+ */
+ if (WARN_ON(offset > len ||
+ fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) {
+ ret = FFA_RET_ABORTED;
+ goto out_unlock;
+ }
+
+ if (len > ffa_desc_buf.len) {
+ ret = FFA_RET_NO_MEMORY;
+ goto out_unlock;
+ }
+
+ buf = ffa_desc_buf.buf;
+ memcpy(buf, hyp_buffers.rx, fraglen);
+
+ for (fragoff = fraglen; fragoff < len; fragoff += fraglen) {
+ ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff);
+ if (res->a0 != FFA_MEM_FRAG_TX) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ fraglen = res->a3;
+ memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen);
+ }
+
+ ffa_mem_reclaim(res, handle_lo, handle_hi, flags);
+ if (res->a0 != FFA_SUCCESS)
+ goto out_unlock;
+
+ reg = (void *)buf + offset;
+ /* If the SPMD was happy, then we should be too. */
+ WARN_ON(ffa_host_unshare_ranges(reg->constituents,
+ reg->addr_range_cnt));
+out_unlock:
+ hyp_spin_unlock(&host_buffers.lock);
+
+ if (ret)
+ ffa_to_smccc_res(res, ret);
+}
+
+/*
+ * Is a given FFA function supported, either by forwarding on directly
+ * or by handling at EL2?
+ */
+static bool ffa_call_supported(u64 func_id)
+{
+ switch (func_id) {
+ /* Unsupported memory management calls */
+ case FFA_FN64_MEM_RETRIEVE_REQ:
+ case FFA_MEM_RETRIEVE_RESP:
+ case FFA_MEM_RELINQUISH:
+ case FFA_MEM_OP_PAUSE:
+ case FFA_MEM_OP_RESUME:
+ case FFA_MEM_FRAG_RX:
+ case FFA_FN64_MEM_DONATE:
+ /* Indirect message passing via RX/TX buffers */
+ case FFA_MSG_SEND:
+ case FFA_MSG_POLL:
+ case FFA_MSG_WAIT:
+ /* 32-bit variants of 64-bit calls */
+ case FFA_MSG_SEND_DIRECT_REQ:
+ case FFA_MSG_SEND_DIRECT_RESP:
+ case FFA_RXTX_MAP:
+ case FFA_MEM_DONATE:
+ case FFA_MEM_RETRIEVE_REQ:
+ return false;
+ }
+
+ return true;
+}
+
+static bool do_ffa_features(struct arm_smccc_res *res,
+ struct kvm_cpu_context *ctxt)
+{
+ DECLARE_REG(u32, id, ctxt, 1);
+ u64 prop = 0;
+ int ret = 0;
+
+ if (!ffa_call_supported(id)) {
+ ret = FFA_RET_NOT_SUPPORTED;
+ goto out_handled;
+ }
+
+ switch (id) {
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ ret = FFA_RET_SUCCESS;
+ prop = 0; /* No support for dynamic buffers */
+ goto out_handled;
+ default:
+ return false;
+ }
+
+out_handled:
+ ffa_to_smccc_res_prop(res, ret, prop);
+ return true;
+}
+
+bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(u64, func_id, host_ctxt, 0);
+ struct arm_smccc_res res;
+
+ /*
+ * There's no way we can tell what a non-standard SMC call might
+ * be up to. Ideally, we would terminate these here and return
+ * an error to the host, but sadly devices make use of custom
+ * firmware calls for things like power management, debugging,
+ * RNG access and crash reporting.
+ *
+ * Given that the architecture requires us to trust EL3 anyway,
+ * we forward unrecognised calls on under the assumption that
+ * the firmware doesn't expose a mechanism to access arbitrary
+ * non-secure memory. Short of a per-device table of SMCs, this
+ * is the best we can do.
+ */
+ if (!is_ffa_call(func_id))
+ return false;
+
+ switch (func_id) {
+ case FFA_FEATURES:
+ if (!do_ffa_features(&res, host_ctxt))
+ return false;
+ goto out_handled;
+ /* Memory management */
+ case FFA_FN64_RXTX_MAP:
+ do_ffa_rxtx_map(&res, host_ctxt);
+ goto out_handled;
+ case FFA_RXTX_UNMAP:
+ do_ffa_rxtx_unmap(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_SHARE:
+ case FFA_FN64_MEM_SHARE:
+ do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_RECLAIM:
+ do_ffa_mem_reclaim(&res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_LEND:
+ case FFA_FN64_MEM_LEND:
+ do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt);
+ goto out_handled;
+ case FFA_MEM_FRAG_TX:
+ do_ffa_mem_frag_tx(&res, host_ctxt);
+ goto out_handled;
+ }
+
+ if (ffa_call_supported(func_id))
+ return false; /* Pass through */
+
+ ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED);
+out_handled:
+ ffa_set_retval(host_ctxt, &res);
+ return true;
+}
+
+int hyp_ffa_init(void *pages)
+{
+ struct arm_smccc_res res;
+ size_t min_rxtx_sz;
+ void *tx, *rx;
+
+ if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2)
+ return 0;
+
+ arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 == FFA_RET_NOT_SUPPORTED)
+ return 0;
+
+ if (res.a0 != FFA_VERSION_1_0)
+ return -EOPNOTSUPP;
+
+ arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ if (res.a2 != HOST_FFA_ID)
+ return -EINVAL;
+
+ arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP,
+ 0, 0, 0, 0, 0, 0, &res);
+ if (res.a0 != FFA_SUCCESS)
+ return -EOPNOTSUPP;
+
+ switch (res.a2) {
+ case FFA_FEAT_RXTX_MIN_SZ_4K:
+ min_rxtx_sz = SZ_4K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_16K:
+ min_rxtx_sz = SZ_16K;
+ break;
+ case FFA_FEAT_RXTX_MIN_SZ_64K:
+ min_rxtx_sz = SZ_64K;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (min_rxtx_sz > PAGE_SIZE)
+ return -EOPNOTSUPP;
+
+ tx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+ rx = pages;
+ pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE;
+
+ ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) {
+ .buf = pages,
+ .len = PAGE_SIZE *
+ (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)),
+ };
+
+ hyp_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ .tx = tx,
+ .rx = rx,
+ };
+
+ host_buffers = (struct kvm_ffa_buffers) {
+ .lock = __HYP_SPIN_LOCK_UNLOCKED,
+ };
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index b6c0188c4b35..c87c63133e10 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -10,6 +10,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_ptrauth.h>
.text
@@ -37,10 +38,43 @@ SYM_FUNC_START(__host_exit)
/* Save the host context pointer in x29 across the function call */
mov x29, x0
+
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_save
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ /* Save kernel ptrauth keys. */
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_save_state x18, x19, x20
+
+ /* Use hyp keys. */
+ adr_this_cpu x18, kvm_hyp_ctxt, x19
+ add x18, x18, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+ isb
+alternative_else_nop_endif
+__skip_pauth_save:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
bl handle_trap
- /* Restore host regs x0-x17 */
__host_enter_restore_full:
+ /* Restore kernel keys. */
+#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+b __skip_pauth_restore
+alternative_else_nop_endif
+
+alternative_if ARM64_KVM_PROTECTED_MODE
+ add x18, x29, #CPU_APIAKEYLO_EL1
+ ptrauth_restore_state x18, x19, x20
+alternative_else_nop_endif
+__skip_pauth_restore:
+#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
+
+ /* Restore host regs x0-x17 */
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index a6d67c2bb5ae..90fade1b032e 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -83,9 +83,6 @@ SYM_CODE_END(__kvm_hyp_init)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START_LOCAL(___kvm_hyp_init)
- ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
- msr tpidr_el2, x1
-
ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA]
mov sp, x1
@@ -95,6 +92,22 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
ldr x1, [x0, #NVHE_INIT_HCR_EL2]
msr hcr_el2, x1
+ mov x2, #HCR_E2H
+ and x2, x1, x2
+ cbz x2, 1f
+
+ // hVHE: Replay the EL2 setup to account for the E2H bit
+ // TPIDR_EL2 is used to preserve x0 across the macro maze...
+ isb
+ msr tpidr_el2, x0
+ init_el2_state
+ finalise_el2_state
+ mrs x0, tpidr_el2
+
+1:
+ ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
+ msr tpidr_el2, x1
+
ldr x1, [x0, #NVHE_INIT_VTTBR]
msr vttbr_el2, x1
@@ -128,6 +141,13 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
orr x0, x0, x1
alternative_else_nop_endif
+
+#ifdef CONFIG_ARM64_BTI_KERNEL
+alternative_if ARM64_BTI
+ orr x0, x0, #SCTLR_EL2_BT
+alternative_else_nop_endif
+#endif /* CONFIG_ARM64_BTI_KERNEL */
+
msr sctlr_el2, x0
isb
@@ -184,6 +204,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
/* Initialize EL2 CPU state to sane values. */
init_el2_state // Clobbers x0..x2
finalise_el2_state
+ __init_el2_nvhe_prepare_eret
/* Enable MMU, set vectors and stack. */
mov x0, x28
@@ -196,6 +217,11 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
SYM_CODE_END(__kvm_hyp_init_cpu)
SYM_CODE_START(__kvm_handle_stub_hvc)
+ /*
+ * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so
+ * we need bti j at beginning.
+ */
+ bti j
cmp x0, #HVC_SOFT_RESTART
b.ne 1f
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 728e01d4536b..a169c619db60 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -13,6 +13,7 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
@@ -125,6 +126,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
}
+static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt)
+{
+ DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
+ DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
+ DECLARE_REG(int, level, host_ctxt, 3);
+
+ __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
+}
+
static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
@@ -315,6 +325,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
+ HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
@@ -374,6 +385,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
handled = kvm_host_psci_handler(host_ctxt);
if (!handled)
+ handled = kvm_host_ffa_handler(host_ctxt);
+ if (!handled)
default_host_smc_handler(host_ctxt);
/* SMC was trapped, move ELR past the current PC. */
@@ -392,7 +405,11 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
handle_host_smc(host_ctxt);
break;
case ESR_ELx_EC_SVE:
- sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
+ if (has_hvhe())
+ sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN |
+ CPACR_EL1_ZEN_EL0EN));
+ else
+ sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
isb();
sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
break;
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index a8813b212996..9d703441278b 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr)
hyp_put_page(&host_s2_pool, addr);
}
-static void host_s2_free_removed_table(void *addr, u32 level)
+static void host_s2_free_unlinked_table(void *addr, u32 level)
{
- kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
+ kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level);
}
static int prepare_s2_pool(void *pgt_pool_base)
@@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base)
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
.zalloc_page = host_s2_zalloc_page,
- .free_removed_table = host_s2_free_removed_table,
+ .free_unlinked_table = host_s2_free_unlinked_table,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
@@ -842,6 +842,13 @@ static int check_share(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_ack_share(completer_addr, tx, share->completer_prot);
break;
+ case PKVM_ID_FFA:
+ /*
+ * We only check the host; the secure side will check the other
+ * end when we forward the FFA call.
+ */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -870,6 +877,13 @@ static int __do_share(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_complete_share(completer_addr, tx, share->completer_prot);
break;
+ case PKVM_ID_FFA:
+ /*
+ * We're not responsible for any secure page-tables, so there's
+ * nothing to do here.
+ */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -918,6 +932,10 @@ static int check_unshare(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_ack_unshare(completer_addr, tx);
break;
+ case PKVM_ID_FFA:
+ /* See check_share() */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -946,6 +964,10 @@ static int __do_unshare(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_complete_unshare(completer_addr, tx);
break;
+ case PKVM_ID_FFA:
+ /* See __do_share() */
+ ret = 0;
+ break;
default:
ret = -EINVAL;
}
@@ -1235,3 +1257,49 @@ void hyp_unpin_shared_mem(void *from, void *to)
hyp_unlock_component();
host_unlock_component();
}
+
+int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = hyp_pfn_to_phys(pfn),
+ },
+ .completer = {
+ .id = PKVM_ID_FFA,
+ },
+ },
+ };
+
+ host_lock_component();
+ ret = do_share(&share);
+ host_unlock_component();
+
+ return ret;
+}
+
+int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
+{
+ int ret;
+ struct pkvm_mem_share share = {
+ .tx = {
+ .nr_pages = nr_pages,
+ .initiator = {
+ .id = PKVM_ID_HOST,
+ .addr = hyp_pfn_to_phys(pfn),
+ },
+ .completer = {
+ .id = PKVM_ID_FFA,
+ },
+ },
+ };
+
+ host_lock_component();
+ ret = do_unshare(&share);
+ host_unlock_component();
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index a06ece14a6d8..8033ef353a5d 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -27,6 +27,7 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
u64 hcr_set = HCR_RW;
u64 hcr_clear = 0;
u64 cptr_set = 0;
+ u64 cptr_clear = 0;
/* Protected KVM does not support AArch32 guests. */
BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0),
@@ -43,6 +44,9 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD),
PVM_ID_AA64PFR0_ALLOW));
+ if (has_hvhe())
+ hcr_set |= HCR_E2H;
+
/* Trap RAS unless all current versions are supported */
if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) <
ID_AA64PFR0_EL1_RAS_V1P1) {
@@ -57,12 +61,17 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
}
/* Trap SVE */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids))
- cptr_set |= CPTR_EL2_TZ;
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) {
+ if (has_hvhe())
+ cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
+ else
+ cptr_set |= CPTR_EL2_TZ;
+ }
vcpu->arch.hcr_el2 |= hcr_set;
vcpu->arch.hcr_el2 &= ~hcr_clear;
vcpu->arch.cptr_el2 |= cptr_set;
+ vcpu->arch.cptr_el2 &= ~cptr_clear;
}
/*
@@ -120,8 +129,12 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
mdcr_set |= MDCR_EL2_TTRF;
/* Trap Trace */
- if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids))
- cptr_set |= CPTR_EL2_TTA;
+ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) {
+ if (has_hvhe())
+ cptr_set |= CPACR_EL1_TTA;
+ else
+ cptr_set |= CPTR_EL2_TTA;
+ }
vcpu->arch.mdcr_el2 |= mdcr_set;
vcpu->arch.mdcr_el2 &= ~mdcr_clear;
@@ -176,8 +189,10 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
/* Clear res0 and set res1 bits to trap potential new features. */
vcpu->arch.hcr_el2 &= ~(HCR_RES0);
vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0);
- vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
- vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+ if (!has_hvhe()) {
+ vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
+ vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
+ }
}
/*
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 110f04627785..bb98630dfeaf 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -11,6 +11,7 @@
#include <asm/kvm_pkvm.h>
#include <nvhe/early_alloc.h>
+#include <nvhe/ffa.h>
#include <nvhe/fixed_config.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
@@ -28,6 +29,7 @@ static void *vmemmap_base;
static void *vm_table_base;
static void *hyp_pgt_base;
static void *host_s2_pgt_base;
+static void *ffa_proxy_pages;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static struct hyp_pool hpool;
@@ -57,6 +59,11 @@ static int divide_memory_pool(void *virt, unsigned long size)
if (!host_s2_pgt_base)
return -ENOMEM;
+ nr_pages = hyp_ffa_proxy_pages();
+ ffa_proxy_pages = hyp_early_alloc_contig(nr_pages);
+ if (!ffa_proxy_pages)
+ return -ENOMEM;
+
return 0;
}
@@ -314,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
+ ret = hyp_ffa_init(ffa_proxy_pages);
+ if (ret)
+ goto out;
+
pkvm_hyp_vm_table_init(vm_table_base);
out:
/*
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 77791495c995..0a6271052def 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -44,13 +44,24 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
__activate_traps_common(vcpu);
val = vcpu->arch.cptr_el2;
- val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
+ val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */
+ val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA;
+ if (cpus_have_final_cap(ARM64_SME)) {
+ if (has_hvhe())
+ val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN);
+ else
+ val |= CPTR_EL2_TSM;
+ }
+
if (!guest_owns_fp_regs(vcpu)) {
- val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
+ if (has_hvhe())
+ val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
+ CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN);
+ else
+ val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
+
__activate_traps_fpsimd32(vcpu);
}
- if (cpus_have_final_cap(ARM64_SME))
- val |= CPTR_EL2_TSM;
write_sysreg(val, cptr_el2);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
@@ -73,7 +84,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
extern char __kvm_hyp_host_vector[];
- u64 cptr;
___deactivate_traps(vcpu);
@@ -98,13 +108,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
- cptr = CPTR_EL2_DEFAULT;
- if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
- cptr |= CPTR_EL2_TZ;
- if (cpus_have_final_cap(ARM64_SME))
- cptr &= ~CPTR_EL2_TSM;
-
- write_sysreg(cptr, cptr_el2);
+ kvm_reset_cptr_el2(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
index b185ac0dbd47..3aaab20ae5b4 100644
--- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c
+++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c
@@ -17,21 +17,24 @@ void __kvm_timer_set_cntvoff(u64 cntvoff)
}
/*
- * Should only be called on non-VHE systems.
+ * Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_disable_traps(struct kvm_vcpu *vcpu)
{
- u64 val;
+ u64 val, shift = 0;
+
+ if (has_hvhe())
+ shift = 10;
/* Allow physical timer/counter access for the host */
val = read_sysreg(cnthctl_el2);
- val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+ val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift;
write_sysreg(val, cnthctl_el2);
}
/*
- * Should only be called on non-VHE systems.
+ * Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_enable_traps(struct kvm_vcpu *vcpu)
@@ -50,5 +53,10 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu)
else
clr |= CNTHCTL_EL1PCTEN;
+ if (has_hvhe()) {
+ clr <<= 10;
+ set <<= 10;
+ }
+
sysreg_clear_set(cnthctl_el2, clr, set);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 978179133f4b..b9991bbd8e3f 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt, true);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ /*
+ * If the host is running at EL1 and we have a VPIPT I-cache,
+ * then we must perform I-cache maintenance at EL2 in order for
+ * it to have an effect on the guest. Since the guest cannot hit
+ * I-cache lines allocated with a different VMID, we don't need
+ * to worry about junk out of guest reset (we nuke the I-cache on
+ * VMID rollover), but we do need to be careful when remapping
+ * executable pages for the same guest. This can happen when KSM
+ * takes a CoW fault on an executable page, copies the page into
+ * a page that was previously mapped in the guest and then needs
+ * to invalidate the guest view of the I-cache for that page
+ * from EL1. To solve this, we invalidate the entire I-cache when
+ * unmapping a page from a guest if we have a VPIPT I-cache but
+ * the host is running at EL1. As above, we could do better if
+ * we had the VA.
+ *
+ * The moral of this story is: if you have a VPIPT I-cache, then
+ * you should be running with VHE enabled.
+ */
+ if (icache_is_vpipt())
+ icache_inval_all_pou();
+
+ __tlb_switch_to_host(&cxt);
+}
+
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 95dae02ccc2e..aa740a974e02 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -21,8 +21,10 @@
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
-#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \
+ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \
+ ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
@@ -34,7 +36,7 @@
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
-#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50)
#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55)
@@ -42,6 +44,8 @@
#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
+
#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
@@ -63,6 +67,16 @@ struct kvm_pgtable_walk_data {
const u64 end;
};
+static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
+}
+
+static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
+}
+
static bool kvm_phys_is_valid(u64 phys)
{
return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
@@ -386,6 +400,9 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
if (device)
return -EINVAL;
+
+ if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
} else {
attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
}
@@ -623,10 +640,18 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable the Hardware Access Flag management, unconditionally
- * on all CPUs. The features is RES0 on CPUs without the support
- * and must be ignored by the CPUs.
+ * on all CPUs. In systems that have asymmetric support for the feature
+ * this allows KVM to leverage hardware support on the subset of cores
+ * that implement the feature.
+ *
+ * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
+ * hardware) on implementations that do not advertise support for the
+ * feature. As such, setting HA unconditionally is safe, unless you
+ * happen to be running on a design that has unadvertised support for
+ * HAFDBS. Here be dragons.
*/
- vtcr |= VTCR_EL2_HA;
+ if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
+ vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */
/* Set the vmid bits */
@@ -755,14 +780,17 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
return false;
- /*
- * Perform the appropriate TLB invalidation based on the evicted pte
- * value (if any).
- */
- if (kvm_pte_table(ctx->old, ctx->level))
- kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
- else if (kvm_pte_valid(ctx->old))
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
+ if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
+ /*
+ * Perform the appropriate TLB invalidation based on the
+ * evicted pte value (if any).
+ */
+ if (kvm_pte_table(ctx->old, ctx->level))
+ kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
+ else if (kvm_pte_valid(ctx->old))
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
+ ctx->addr, ctx->level);
+ }
if (stage2_pte_is_counted(ctx->old))
mm_ops->put_page(ctx->ptep);
@@ -869,11 +897,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
return -EAGAIN;
/* Perform CMOs before installation of the guest stage-2 PTE */
- if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
+ stage2_pte_cacheable(pgt, new))
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
- granule);
+ granule);
- if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
+ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
+ stage2_pte_executable(new))
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
stage2_make_pte(ctx, new);
@@ -895,7 +925,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
if (ret)
return ret;
- mm_ops->free_removed_table(childp, ctx->level);
+ mm_ops->free_unlinked_table(childp, ctx->level);
return 0;
}
@@ -940,7 +970,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
* The TABLE_PRE callback runs for table entries on the way down, looking
* for table entries which we could conceivably replace with a block entry
* for this mapping. If it finds one it replaces the entry and calls
- * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table.
+ * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
*
* Otherwise, the LEAF callback performs the mapping at the existing leaves
* instead.
@@ -1209,7 +1239,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
if (!ret)
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
return ret;
}
@@ -1242,6 +1272,162 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
return kvm_pgtable_walk(pgt, addr, size, &walker);
}
+kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
+ u64 phys, u32 level,
+ enum kvm_pgtable_prot prot,
+ void *mc, bool force_pte)
+{
+ struct stage2_map_data map_data = {
+ .phys = phys,
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ .force_pte = force_pte,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
+ KVM_PGTABLE_WALK_SKIP_CMO,
+ .arg = &map_data,
+ };
+ /*
+ * The input address (.addr) is irrelevant for walking an
+ * unlinked table. Construct an ambiguous IA range to map
+ * kvm_granule_size(level) worth of memory.
+ */
+ struct kvm_pgtable_walk_data data = {
+ .walker = &walker,
+ .addr = 0,
+ .end = kvm_granule_size(level),
+ };
+ struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
+ kvm_pte_t *pgtable;
+ int ret;
+
+ if (!IS_ALIGNED(phys, kvm_granule_size(level)))
+ return ERR_PTR(-EINVAL);
+
+ ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ pgtable = mm_ops->zalloc_page(mc);
+ if (!pgtable)
+ return ERR_PTR(-ENOMEM);
+
+ ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
+ level + 1);
+ if (ret) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
+ mm_ops->put_page(pgtable);
+ return ERR_PTR(ret);
+ }
+
+ return pgtable;
+}
+
+/*
+ * Get the number of page-tables needed to replace a block with a
+ * fully populated tree up to the PTE entries. Note that @level is
+ * interpreted as in "level @level entry".
+ */
+static int stage2_block_get_nr_page_tables(u32 level)
+{
+ switch (level) {
+ case 1:
+ return PTRS_PER_PTE + 1;
+ case 2:
+ return 1;
+ case 3:
+ return 0;
+ default:
+ WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
+ level >= KVM_PGTABLE_MAX_LEVELS);
+ return -EINVAL;
+ };
+}
+
+static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ struct kvm_mmu_memory_cache *mc = ctx->arg;
+ struct kvm_s2_mmu *mmu;
+ kvm_pte_t pte = ctx->old, new, *childp;
+ enum kvm_pgtable_prot prot;
+ u32 level = ctx->level;
+ bool force_pte;
+ int nr_pages;
+ u64 phys;
+
+ /* No huge-pages exist at the last level */
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return 0;
+
+ /* We only split valid block mappings */
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ nr_pages = stage2_block_get_nr_page_tables(level);
+ if (nr_pages < 0)
+ return nr_pages;
+
+ if (mc->nobjs >= nr_pages) {
+ /* Build a tree mapped down to the PTE granularity. */
+ force_pte = true;
+ } else {
+ /*
+ * Don't force PTEs, so create_unlinked() below does
+ * not populate the tree up to the PTE level. The
+ * consequence is that the call will require a single
+ * page of level 2 entries at level 1, or a single
+ * page of PTEs at level 2. If we are at level 1, the
+ * PTEs will be created recursively.
+ */
+ force_pte = false;
+ nr_pages = 1;
+ }
+
+ if (mc->nobjs < nr_pages)
+ return -ENOMEM;
+
+ mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
+ phys = kvm_pte_to_phys(pte);
+ prot = kvm_pgtable_stage2_pte_prot(pte);
+
+ childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
+ level, prot, mc, force_pte);
+ if (IS_ERR(childp))
+ return PTR_ERR(childp);
+
+ if (!stage2_try_break_pte(ctx, mmu)) {
+ kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
+ mm_ops->put_page(childp);
+ return -EAGAIN;
+ }
+
+ /*
+ * Note, the contents of the page table are guaranteed to be made
+ * visible before the new PTE is assigned because stage2_make_pte()
+ * writes the PTE using smp_store_release().
+ */
+ new = kvm_init_table_pte(childp, mm_ops);
+ stage2_make_pte(ctx, new);
+ dsb(ishst);
+ return 0;
+}
+
+int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_mmu_memory_cache *mc)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_split_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = mc,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
struct kvm_pgtable_mm_ops *mm_ops,
@@ -1311,7 +1497,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
pgt->pgd = NULL;
}
-void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
+void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
{
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
struct kvm_pgtable_walker walker = {
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index b37e7c96efea..6537f58b1a8c 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -84,7 +84,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
*/
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
- write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
+ kvm_reset_cptr_el2(vcpu);
if (!arm64_kernel_unmapped_at_el0())
host_vectors = __this_cpu_read(this_cpu_vector);
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 24cef9b87f9e..e69da550cdc5 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
+void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
+ phys_addr_t ipa, int level)
+{
+ struct tlb_inv_context cxt;
+
+ dsb(nshst);
+
+ /* Switch to requested VMID */
+ __tlb_switch_to_guest(mmu, &cxt);
+
+ /*
+ * We could do so much better if we had the VA as well.
+ * Instead, we invalidate Stage-2 for this IPA, and the
+ * whole of Stage-1. Weep...
+ */
+ ipa >>= 12;
+ __tlbi_level(ipas2e1, ipa, level);
+
+ /*
+ * We have to ensure completion of the invalidation at Stage-2,
+ * since a table walk on another CPU could refill a TLB with a
+ * complete (S1 + S2) walk based on the old Stage-2 mapping if
+ * the Stage-1 invalidation happened first.
+ */
+ dsb(nsh);
+ __tlbi(vmalle1);
+ dsb(nsh);
+ isb();
+
+ __tlb_switch_to_host(&cxt);
+}
+
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3b9d4d24c361..6db9ef288ec3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector;
static unsigned long __ro_after_init io_map_base;
-static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+ phys_addr_t size)
{
- phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
return (boundary - 1 < end - 1) ? boundary : end;
}
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+ return __stage2_range_addr_end(addr, end, size);
+}
+
/*
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
@@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
#define stage2_apply_range_resched(mmu, addr, end, fn) \
stage2_apply_range(mmu, addr, end, fn, true)
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+ int n = 0;
+
+ if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+ n += DIV_ROUND_UP(range, PUD_SIZE);
+ n += DIV_ROUND_UP(range, PMD_SIZE);
+ return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+ struct kvm_mmu_memory_cache *cache;
+ u64 chunk_size, min;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ min = kvm_mmu_split_nr_page_tables(chunk_size);
+ cache = &kvm->arch.mmu.split_page_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end)
+{
+ struct kvm_mmu_memory_cache *cache;
+ struct kvm_pgtable *pgt;
+ int ret, cache_capacity;
+ u64 next, chunk_size;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+ if (chunk_size == 0)
+ return 0;
+
+ cache = &kvm->arch.mmu.split_page_cache;
+
+ do {
+ if (need_split_memcache_topup_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /* Eager page splitting is best-effort. */
+ ret = __kvm_mmu_topup_memory_cache(cache,
+ cache_capacity,
+ cache_capacity);
+ write_lock(&kvm->mmu_lock);
+ if (ret)
+ break;
+ }
+
+ pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = __stage2_range_addr_end(addr, end, chunk_size);
+ ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+ if (ret)
+ break;
+ } while (addr = next, addr != end);
+
+ return ret;
+}
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -131,21 +211,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size)
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
-static void stage2_free_removed_table_rcu_cb(struct rcu_head *head)
+static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
struct page *page = container_of(head, struct page, rcu_head);
void *pgtable = page_to_virt(page);
u32 level = page_private(page);
- kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level);
+ kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
}
-static void stage2_free_removed_table(void *addr, u32 level)
+static void stage2_free_unlinked_table(void *addr, u32 level)
{
struct page *page = virt_to_page(addr);
set_page_private(page, (unsigned long)level);
- call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb);
+ call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}
static void kvm_host_get_page(void *addr)
@@ -701,7 +781,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
.zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
.free_pages_exact = kvm_s2_free_pages_exact,
- .free_removed_table = stage2_free_removed_table,
+ .free_unlinked_table = stage2_free_unlinked_table,
.get_page = kvm_host_get_page,
.put_page = kvm_s2_put_page,
.page_count = kvm_host_page_count,
@@ -775,6 +855,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
+ /* The eager page splitting is disabled by default */
+ mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ mmu->split_page_cache.gfp_zero = __GFP_ZERO;
+
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
return 0;
@@ -786,6 +870,12 @@ out_free_pgtable:
return err;
}
+void kvm_uninit_stage2_mmu(struct kvm *kvm)
+{
+ kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
+}
+
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
@@ -989,39 +1079,66 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
}
/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ * pages for memory slot
* @kvm: The KVM pointer
- * @slot: The memory slot associated with mask
- * @gfn_offset: The gfn offset in memory slot
- * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
- * slot to be write protected
+ * @slot: The memory slot to split
*
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
*/
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
- phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
- phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
- phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ phys_addr_t start, end;
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ slots = kvm_memslots(kvm);
+ memslot = id_to_memslot(slots, slot);
+
+ start = memslot->base_gfn << PAGE_SHIFT;
+ end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ write_lock(&kvm->mmu_lock);
+ kvm_mmu_split_huge_pages(kvm, start, end);
+ write_unlock(&kvm->mmu_lock);
}
/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of pages at offset 'gfn_offset' in this memory
+ * slot to enable dirty logging on
*
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
+ * Writes protect selected pages to enable dirty logging, and then
+ * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ stage2_wp_range(&kvm->arch.mmu, start, end);
+
+ /*
+ * Eager-splitting is done when manual-protect is set. We
+ * also check for initially-all-set because we can avoid
+ * eager-splitting if initially-all-set is false.
+ * Initially-all-set equal false implies that huge-pages were
+ * already split when enabling dirty logging: no need to do it
+ * again.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_split_huge_pages(kvm, start, end);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1790,20 +1907,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
/*
* At this point memslot has been committed and there is an
* allocated dirty_bitmap[], dirty pages will be tracked while the
* memory slot is write protected.
*/
- if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (log_dirty_pages) {
+
+ if (change == KVM_MR_DELETE)
+ return;
+
/*
- * If we're with initial-all-set, we don't need to write
- * protect any pages because they're all reported as dirty.
- * Huge pages and normal pages will be write protect gradually.
+ * Huge and normal pages are write-protected and split
+ * on either of these two cases:
+ *
+ * 1. with initial-all-set: gradually with CLEAR ioctls,
*/
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- kvm_mmu_wp_memory_region(kvm, new->id);
- }
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+ /*
+ * or
+ * 2. without initial-all-set: all in one shot when
+ * enabling dirty logging.
+ */
+ kvm_mmu_wp_memory_region(kvm, new->id);
+ kvm_mmu_split_memory_region(kvm, new->id);
+ } else {
+ /*
+ * Free any leftovers from the eager page splitting cache. Do
+ * this when deleting, moving, disabling dirty logging, or
+ * creating the memslot (a nop). Doing it for deletes makes
+ * sure we don't leak memory, and there's no need to keep the
+ * cache around for any of the other cases.
+ */
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}
}
@@ -1877,7 +2016,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_uninit_stage2_mmu(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 6e9ece1ebbe7..994a494703c3 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -78,6 +78,7 @@ void __init kvm_hyp_reserve(void)
hyp_mem_pages += host_s2_pgtable_pages();
hyp_mem_pages += hyp_vm_table_pages();
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
+ hyp_mem_pages += hyp_ffa_proxy_pages();
/*
* Try to allocate a PMD-aligned region to reduce TLB pressure once
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index b5dee8e57e77..bc8556b6f459 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -187,57 +187,6 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
}
/**
- * kvm_set_vm_width() - set the register width for the guest
- * @vcpu: Pointer to the vcpu being configured
- *
- * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED
- * in the VM flags based on the vcpu's requested register width, the HW
- * capabilities and other options (such as MTE).
- * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be
- * consistent with the value of the FLAG_EL1_32BIT bit in the flags.
- *
- * Return: 0 on success, negative error code on failure.
- */
-static int kvm_set_vm_width(struct kvm_vcpu *vcpu)
-{
- struct kvm *kvm = vcpu->kvm;
- bool is32bit;
-
- is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT);
-
- lockdep_assert_held(&kvm->arch.config_lock);
-
- if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) {
- /*
- * The guest's register width is already configured.
- * Make sure that the vcpu is consistent with it.
- */
- if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags))
- return 0;
-
- return -EINVAL;
- }
-
- if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
- return -EINVAL;
-
- /* MTE is incompatible with AArch32 */
- if (kvm_has_mte(kvm) && is32bit)
- return -EINVAL;
-
- /* NV is incompatible with AArch32 */
- if (vcpu_has_nv(vcpu) && is32bit)
- return -EINVAL;
-
- if (is32bit)
- set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags);
-
- set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags);
-
- return 0;
-}
-
-/**
* kvm_reset_vcpu - sets core registers and sys_regs to reset value
* @vcpu: The VCPU pointer
*
@@ -262,13 +211,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
bool loaded;
u32 pstate;
- mutex_lock(&vcpu->kvm->arch.config_lock);
- ret = kvm_set_vm_width(vcpu);
- mutex_unlock(&vcpu->kvm->arch.config_lock);
-
- if (ret)
- return ret;
-
spin_lock(&vcpu->arch.mp_state_lock);
reset_state = vcpu->arch.reset_state;
vcpu->arch.reset_state.reset = false;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 5b5d5e5449dc..bd3431823ec5 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -42,6 +42,8 @@
*/
static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
+static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 val);
static bool read_from_write_only(struct kvm_vcpu *vcpu,
struct sys_reg_params *params,
@@ -553,10 +555,11 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_bvr(struct kvm_vcpu *vcpu,
+static u64 reset_bvr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val;
+ return rd->val;
}
static bool trap_bcr(struct kvm_vcpu *vcpu,
@@ -589,10 +592,11 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_bcr(struct kvm_vcpu *vcpu,
+static u64 reset_bcr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val;
+ return rd->val;
}
static bool trap_wvr(struct kvm_vcpu *vcpu,
@@ -626,10 +630,11 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_wvr(struct kvm_vcpu *vcpu,
+static u64 reset_wvr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val;
+ return rd->val;
}
static bool trap_wcr(struct kvm_vcpu *vcpu,
@@ -662,25 +667,28 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
return 0;
}
-static void reset_wcr(struct kvm_vcpu *vcpu,
+static u64 reset_wcr(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd)
{
vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val;
+ return rd->val;
}
-static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 amair = read_sysreg(amair_el1);
vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1);
+ return amair;
}
-static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 actlr = read_sysreg(actlr_el1);
vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1);
+ return actlr;
}
-static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 mpidr;
@@ -694,7 +702,10 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
- vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1);
+ mpidr |= (1ULL << 31);
+ vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1);
+
+ return mpidr;
}
static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
@@ -706,13 +717,13 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
-static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX);
/* No PMU available, any PMU reg may UNDEF... */
if (!kvm_arm_support_pmu_v3())
- return;
+ return 0;
n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT;
n &= ARMV8_PMU_PMCR_N_MASK;
@@ -721,33 +732,41 @@ static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= mask;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
reset_unknown(vcpu, r);
__vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 pmcr;
/* No PMU available, PMCR_EL0 may UNDEF... */
if (!kvm_arm_support_pmu_v3())
- return;
+ return 0;
/* Only preserve PMCR_EL0.N, and reset the rest to 0 */
pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT);
@@ -755,6 +774,8 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
pmcr |= ARMV8_PMU_PMCR_LC;
__vcpu_sys_reg(vcpu, r->reg) = pmcr;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
@@ -1187,25 +1208,89 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
return true;
}
-static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu)
+static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
+ s64 new, s64 cur)
{
- if (kvm_vcpu_has_pmu(vcpu))
- return vcpu->kvm->arch.dfr0_pmuver.imp;
+ struct arm64_ftr_bits kvm_ftr = *ftrp;
+
+ /* Some features have different safe value type in KVM than host features */
+ switch (id) {
+ case SYS_ID_AA64DFR0_EL1:
+ if (kvm_ftr.shift == ID_AA64DFR0_EL1_PMUVer_SHIFT)
+ kvm_ftr.type = FTR_LOWER_SAFE;
+ break;
+ case SYS_ID_DFR0_EL1:
+ if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT)
+ kvm_ftr.type = FTR_LOWER_SAFE;
+ break;
+ }
- return vcpu->kvm->arch.dfr0_pmuver.unimp;
+ return arm64_ftr_safe_value(&kvm_ftr, new, cur);
}
-static u8 perfmon_to_pmuver(u8 perfmon)
+/**
+ * arm64_check_features() - Check if a feature register value constitutes
+ * a subset of features indicated by the idreg's KVM sanitised limit.
+ *
+ * This function will check if each feature field of @val is the "safe" value
+ * against idreg's KVM sanitised limit return from reset() callback.
+ * If a field value in @val is the same as the one in limit, it is always
+ * considered the safe value regardless For register fields that are not in
+ * writable, only the value in limit is considered the safe value.
+ *
+ * Return: 0 if all the fields are safe. Otherwise, return negative errno.
+ */
+static int arm64_check_features(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd,
+ u64 val)
{
- switch (perfmon) {
- case ID_DFR0_EL1_PerfMon_PMUv3:
- return ID_AA64DFR0_EL1_PMUVer_IMP;
- case ID_DFR0_EL1_PerfMon_IMPDEF:
- return ID_AA64DFR0_EL1_PMUVer_IMP_DEF;
- default:
- /* Anything ARMv8.1+ and NI have the same value. For now. */
- return perfmon;
+ const struct arm64_ftr_reg *ftr_reg;
+ const struct arm64_ftr_bits *ftrp = NULL;
+ u32 id = reg_to_encoding(rd);
+ u64 writable_mask = rd->val;
+ u64 limit = rd->reset(vcpu, rd);
+ u64 mask = 0;
+
+ /*
+ * Hidden and unallocated ID registers may not have a corresponding
+ * struct arm64_ftr_reg. Of course, if the register is RAZ we know the
+ * only safe value is 0.
+ */
+ if (sysreg_visible_as_raz(vcpu, rd))
+ return val ? -E2BIG : 0;
+
+ ftr_reg = get_arm64_ftr_reg(id);
+ if (!ftr_reg)
+ return -EINVAL;
+
+ ftrp = ftr_reg->ftr_bits;
+
+ for (; ftrp && ftrp->width; ftrp++) {
+ s64 f_val, f_lim, safe_val;
+ u64 ftr_mask;
+
+ ftr_mask = arm64_ftr_mask(ftrp);
+ if ((ftr_mask & writable_mask) != ftr_mask)
+ continue;
+
+ f_val = arm64_ftr_value(ftrp, val);
+ f_lim = arm64_ftr_value(ftrp, limit);
+ mask |= ftr_mask;
+
+ if (f_val == f_lim)
+ safe_val = f_val;
+ else
+ safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim);
+
+ if (safe_val != f_val)
+ return -E2BIG;
}
+
+ /* For fields that are not writable, values in limit are the safe values. */
+ if ((val & ~mask) != (limit & ~mask))
+ return -E2BIG;
+
+ return 0;
}
static u8 pmuver_to_perfmon(u8 pmuver)
@@ -1222,7 +1307,8 @@ static u8 pmuver_to_perfmon(u8 pmuver)
}
/* Read a sanitised cpufeature ID register by sys_reg_desc */
-static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r)
+static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
{
u32 id = reg_to_encoding(r);
u64 val;
@@ -1233,19 +1319,6 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
val = read_sanitised_ftr_reg(id);
switch (id) {
- case SYS_ID_AA64PFR0_EL1:
- if (!vcpu_has_sve(vcpu))
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2);
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3);
- if (kvm_vgic_global_state.type == VGIC_V3) {
- val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1);
- }
- break;
case SYS_ID_AA64PFR1_EL1:
if (!kvm_has_mte(vcpu->kvm))
val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE);
@@ -1267,22 +1340,6 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS);
break;
- case SYS_ID_AA64DFR0_EL1:
- /* Limit debug to ARMv8.0 */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6);
- /* Set PMUver to the required version */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer),
- vcpu_pmuver(vcpu));
- /* Hide SPE from guests */
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer);
- break;
- case SYS_ID_DFR0_EL1:
- val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
- val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon),
- pmuver_to_perfmon(vcpu_pmuver(vcpu)));
- break;
case SYS_ID_AA64MMFR2_EL1:
val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
break;
@@ -1294,6 +1351,28 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
return val;
}
+static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+ return __kvm_read_sanitised_id_reg(vcpu, r);
+}
+
+static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ return IDREG(vcpu->kvm, reg_to_encoding(r));
+}
+
+/*
+ * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is
+ * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8.
+ */
+static inline bool is_id_reg(u32 id)
+{
+ return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 &&
+ sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 &&
+ sys_reg_CRm(id) < 8);
+}
+
static unsigned int id_visibility(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
@@ -1355,88 +1434,113 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
return REG_HIDDEN;
}
-static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *rd,
- u64 val)
+static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
{
- u8 csv2, csv3;
+ u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+ if (!vcpu_has_sve(vcpu))
+ val &= ~ID_AA64PFR0_EL1_SVE_MASK;
/*
- * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as
- * it doesn't promise more than what is actually provided (the
- * guest could otherwise be covered in ectoplasmic residue).
+ * The default is to expose CSV2 == 1 if the HW isn't affected.
+ * Although this is a per-CPU feature, we make it global because
+ * asymmetric systems are just a nuisance.
+ *
+ * Userspace can override this as long as it doesn't promise
+ * the impossible.
*/
- csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT);
- if (csv2 > 1 ||
- (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED))
- return -EINVAL;
+ if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) {
+ val &= ~ID_AA64PFR0_EL1_CSV2_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP);
+ }
+ if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) {
+ val &= ~ID_AA64PFR0_EL1_CSV3_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);
+ }
- /* Same thing for CSV3 */
- csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT);
- if (csv3 > 1 ||
- (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED))
- return -EINVAL;
+ if (kvm_vgic_global_state.type == VGIC_V3) {
+ val &= ~ID_AA64PFR0_EL1_GIC_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ }
- /* We can only differ with CSV[23], and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
- ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));
- if (val)
- return -EINVAL;
+ val &= ~ID_AA64PFR0_EL1_AMU_MASK;
- vcpu->kvm->arch.pfr0_csv2 = csv2;
- vcpu->kvm->arch.pfr0_csv3 = csv3;
+ return val;
+}
- return 0;
+static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);
+
+ /* Limit debug to ARMv8.0 */
+ val &= ~ID_AA64DFR0_EL1_DebugVer_MASK;
+ val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DebugVer, IMP);
+
+ /*
+ * Only initialize the PMU version if the vCPU was configured with one.
+ */
+ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
+ if (kvm_vcpu_has_pmu(vcpu))
+ val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer,
+ kvm_arm_pmu_get_pmuver_limit());
+
+ /* Hide SPE from guests */
+ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK;
+
+ return val;
}
static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 val)
{
- u8 pmuver, host_pmuver;
- bool valid_pmu;
-
- host_pmuver = kvm_arm_pmu_get_pmuver_limit();
+ u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val);
/*
- * Allow AA64DFR0_EL1.PMUver to be set from userspace as long
- * as it doesn't promise more than what the HW gives us. We
- * allow an IMPDEF PMU though, only if no PMU is supported
- * (KVM backward compatibility handling).
+ * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the
+ * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously
+ * exposed an IMP_DEF PMU to userspace and the guest on systems w/
+ * non-architectural PMUs. Of course, PMUv3 is the only game in town for
+ * PMU virtualization, so the IMP_DEF value was rather user-hostile.
+ *
+ * At minimum, we're on the hook to allow values that were given to
+ * userspace by KVM. Cover our tracks here and replace the IMP_DEF value
+ * with a more sensible NI. The value of an ID register changing under
+ * the nose of the guest is unfortunate, but is certainly no more
+ * surprising than an ill-guided PMU driver poking at impdef system
+ * registers that end in an UNDEF...
*/
- pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val);
- if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver))
- return -EINVAL;
+ if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
+ val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
- valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF);
-
- /* Make sure view register and PMU support do match */
- if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
- return -EINVAL;
+ return set_id_reg(vcpu, rd, val);
+}
- /* We can only differ with PMUver, and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer);
- if (val)
- return -EINVAL;
+static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1);
- if (valid_pmu)
- vcpu->kvm->arch.dfr0_pmuver.imp = pmuver;
- else
- vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver;
+ val &= ~ID_DFR0_EL1_PerfMon_MASK;
+ if (kvm_vcpu_has_pmu(vcpu))
+ val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon);
- return 0;
+ return val;
}
static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 val)
{
- u8 perfmon, host_perfmon;
- bool valid_pmu;
+ u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val);
- host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());
+ if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) {
+ val &= ~ID_DFR0_EL1_PerfMon_MASK;
+ perfmon = 0;
+ }
/*
* Allow DFR0_EL1.PerfMon to be set from userspace as long as
@@ -1444,29 +1548,10 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
* AArch64 side (as everything is emulated with that), and
* that this is a PMUv3.
*/
- perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val);
- if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) ||
- (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3))
+ if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)
return -EINVAL;
- valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF);
-
- /* Make sure view register and PMU support do match */
- if (kvm_vcpu_has_pmu(vcpu) != valid_pmu)
- return -EINVAL;
-
- /* We can only differ with PerfMon, and anything else is an error */
- val ^= read_id_reg(vcpu, rd);
- val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon);
- if (val)
- return -EINVAL;
-
- if (valid_pmu)
- vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon);
- else
- vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon);
-
- return 0;
+ return set_id_reg(vcpu, rd, val);
}
/*
@@ -1479,18 +1564,60 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 *val)
{
+ /*
+ * Avoid locking if the VM has already started, as the ID registers are
+ * guaranteed to be invariant at that point.
+ */
+ if (kvm_vm_has_ran_once(vcpu->kvm)) {
+ *val = read_id_reg(vcpu, rd);
+ return 0;
+ }
+
+ mutex_lock(&vcpu->kvm->arch.config_lock);
*val = read_id_reg(vcpu, rd);
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
return 0;
}
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
u64 val)
{
- /* This is what we mean by invariant: you can't change it. */
- if (val != read_id_reg(vcpu, rd))
- return -EINVAL;
+ u32 id = reg_to_encoding(rd);
+ int ret;
- return 0;
+ mutex_lock(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * Once the VM has started the ID registers are immutable. Reject any
+ * write that does not match the final register value.
+ */
+ if (kvm_vm_has_ran_once(vcpu->kvm)) {
+ if (val != read_id_reg(vcpu, rd))
+ ret = -EBUSY;
+ else
+ ret = 0;
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+ return ret;
+ }
+
+ ret = arm64_check_features(vcpu, rd, val);
+ if (!ret)
+ IDREG(vcpu->kvm, id) = val;
+
+ mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+ /*
+ * arm64_check_features() returns -E2BIG to indicate the register's
+ * feature set is a superset of the maximally-allowed register value.
+ * While it would be nice to precisely describe this to userspace, the
+ * existing UAPI for KVM_SET_ONE_REG has it that invalid register
+ * writes return -EINVAL.
+ */
+ if (ret == -E2BIG)
+ ret = -EINVAL;
+ return ret;
}
static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
@@ -1530,7 +1657,7 @@ static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
* Fabricate a CLIDR_EL1 value instead of using the real value, which can vary
* by the physical CPU which the vcpu currently resides in.
*/
-static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
u64 clidr;
@@ -1578,6 +1705,8 @@ static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
clidr |= 2 << CLIDR_TTYPE_SHIFT(loc);
__vcpu_sys_reg(vcpu, r->reg) = clidr;
+
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
@@ -1677,6 +1806,17 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.visibility = elx2_visibility, \
}
+/*
+ * Since reset() callback and field val are not used for idregs, they will be
+ * used for specific purposes for idregs.
+ * The reset() would return KVM sanitised register value. The value would be the
+ * same as the host kernel sanitised value if there is no KVM sanitisation.
+ * The val would be used as a mask indicating writable fields for the idreg.
+ * Only bits with 1 are writable from userspace. This mask might not be
+ * necessary in the future whenever all ID registers are enabled as writable
+ * from userspace.
+ */
+
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
SYS_DESC(SYS_##name), \
@@ -1684,6 +1824,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = id_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
/* sys_reg_desc initialiser for known cpufeature ID registers */
@@ -1693,6 +1835,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = aa32_id_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
/*
@@ -1705,7 +1849,9 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.access = access_id_reg, \
.get_user = get_id_reg, \
.set_user = set_id_reg, \
- .visibility = raz_visibility \
+ .visibility = raz_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
/*
@@ -1719,6 +1865,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
.get_user = get_id_reg, \
.set_user = set_id_reg, \
.visibility = raz_visibility, \
+ .reset = kvm_read_sanitised_id_reg, \
+ .val = 0, \
}
static bool access_sp_el1(struct kvm_vcpu *vcpu,
@@ -1826,9 +1974,13 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* CRm=1 */
AA32_ID_SANITISED(ID_PFR0_EL1),
AA32_ID_SANITISED(ID_PFR1_EL1),
- { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_dfr0_el1,
- .visibility = aa32_id_visibility, },
+ { SYS_DESC(SYS_ID_DFR0_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_dfr0_el1,
+ .visibility = aa32_id_visibility,
+ .reset = read_sanitised_id_dfr0_el1,
+ .val = ID_DFR0_EL1_PerfMon_MASK, },
ID_HIDDEN(ID_AFR0_EL1),
AA32_ID_SANITISED(ID_MMFR0_EL1),
AA32_ID_SANITISED(ID_MMFR1_EL1),
@@ -1857,8 +2009,12 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* AArch64 ID registers */
/* CRm=4 */
- { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, },
+ { SYS_DESC(SYS_ID_AA64PFR0_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_reg,
+ .reset = read_sanitised_id_aa64pfr0_el1,
+ .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, },
ID_SANITISED(ID_AA64PFR1_EL1),
ID_UNALLOCATED(4,2),
ID_UNALLOCATED(4,3),
@@ -1868,8 +2024,12 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_UNALLOCATED(4,7),
/* CRm=5 */
- { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg,
- .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, },
+ { SYS_DESC(SYS_ID_AA64DFR0_EL1),
+ .access = access_id_reg,
+ .get_user = get_id_reg,
+ .set_user = set_id_aa64dfr0_el1,
+ .reset = read_sanitised_id_aa64dfr0_el1,
+ .val = ID_AA64DFR0_EL1_PMUVer_MASK, },
ID_SANITISED(ID_AA64DFR1_EL1),
ID_UNALLOCATED(5,2),
ID_UNALLOCATED(5,3),
@@ -2203,7 +2363,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
EL2_REG(HCR_EL2, access_rw, reset_val, 0),
EL2_REG(MDCR_EL2, access_rw, reset_val, 0),
- EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ),
+ EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
EL2_REG(HSTR_EL2, access_rw, reset_val, 0),
EL2_REG(HACR_EL2, access_rw, reset_val, 0),
@@ -2260,6 +2420,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};
+static const struct sys_reg_desc *first_idreg;
+
static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -2950,6 +3112,28 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
return false;
}
+static void kvm_reset_id_regs(struct kvm_vcpu *vcpu)
+{
+ const struct sys_reg_desc *idreg = first_idreg;
+ u32 id = reg_to_encoding(idreg);
+ struct kvm *kvm = vcpu->kvm;
+
+ if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
+ return;
+
+ lockdep_assert_held(&kvm->arch.config_lock);
+
+ /* Initialize all idregs */
+ while (is_id_reg(id)) {
+ IDREG(kvm, id) = idreg->reset(vcpu, idreg);
+
+ idreg++;
+ id = reg_to_encoding(idreg);
+ }
+
+ set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
+}
+
/**
* kvm_reset_sys_regs - sets system registers to reset value
* @vcpu: The VCPU pointer
@@ -2961,9 +3145,17 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
{
unsigned long i;
- for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++)
- if (sys_reg_descs[i].reset)
- sys_reg_descs[i].reset(vcpu, &sys_reg_descs[i]);
+ kvm_reset_id_regs(vcpu);
+
+ for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
+ const struct sys_reg_desc *r = &sys_reg_descs[i];
+
+ if (is_id_reg(reg_to_encoding(r)))
+ continue;
+
+ if (r->reset)
+ r->reset(vcpu, r);
+ }
}
/**
@@ -3064,19 +3256,21 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
*/
#define FUNCTION_INVARIANT(reg) \
- static void get_##reg(struct kvm_vcpu *v, \
+ static u64 get_##reg(struct kvm_vcpu *v, \
const struct sys_reg_desc *r) \
{ \
((struct sys_reg_desc *)r)->val = read_sysreg(reg); \
+ return ((struct sys_reg_desc *)r)->val; \
}
FUNCTION_INVARIANT(midr_el1)
FUNCTION_INVARIANT(revidr_el1)
FUNCTION_INVARIANT(aidr_el1)
-static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
+static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
{
((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ return ((struct sys_reg_desc *)r)->val;
}
/* ->val is filled in by kvm_sys_reg_table_init() */
@@ -3368,6 +3562,7 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
int __init kvm_sys_reg_table_init(void)
{
+ struct sys_reg_params params;
bool valid = true;
unsigned int i;
@@ -3386,5 +3581,11 @@ int __init kvm_sys_reg_table_init(void)
for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
+ /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */
+ params = encoding_to_params(SYS_ID_PFR0_EL1);
+ first_idreg = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
+ if (!first_idreg)
+ return -EINVAL;
+
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index 6b11f2cc7146..c65c129b3500 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -27,6 +27,13 @@ struct sys_reg_params {
bool is_write;
};
+#define encoding_to_params(reg) \
+ ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \
+ .Op1 = sys_reg_Op1(reg), \
+ .CRn = sys_reg_CRn(reg), \
+ .CRm = sys_reg_CRm(reg), \
+ .Op2 = sys_reg_Op2(reg) })
+
#define esr_sys64_to_params(esr) \
((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \
.Op1 = ((esr) >> 14) & 0x7, \
@@ -64,13 +71,16 @@ struct sys_reg_desc {
struct sys_reg_params *,
const struct sys_reg_desc *);
- /* Initialization for vcpu. */
- void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
+ /*
+ * Initialization for vcpu. Return initialized value, or KVM
+ * sanitized value for ID registers.
+ */
+ u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
/* Index into sys_reg[], or 0 if we don't need to save it. */
int reg;
- /* Value (usually reset value) */
+ /* Value (usually reset value), or write mask for idregs */
u64 val;
/* Custom get/set_user functions, fallback to generic if NULL */
@@ -123,19 +133,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu,
}
/* Reset functions */
-static inline void reset_unknown(struct kvm_vcpu *vcpu,
+static inline u64 reset_unknown(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
__vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
+ return __vcpu_sys_reg(vcpu, r->reg);
}
-static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
__vcpu_sys_reg(vcpu, r->reg) = r->val;
+ return __vcpu_sys_reg(vcpu, r->reg);
}
static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 19c23c4fa2da..c80ed4f3cbce 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -25,6 +25,7 @@ HAS_E0PD
HAS_ECV
HAS_ECV_CNTPOFF
HAS_EPAN
+HAS_EVT
HAS_GENERIC_AUTH
HAS_GENERIC_AUTH_ARCH_QARMA3
HAS_GENERIC_AUTH_ARCH_QARMA5
@@ -51,6 +52,7 @@ HAS_TLB_RANGE
HAS_VIRT_HOST_EXTN
HAS_WFXT
HW_DBM
+KVM_HVHE
KVM_PROTECTED_MODE
MISMATCHED_CACHE_TYPE
MTE
@@ -81,6 +83,7 @@ WORKAROUND_2077057
WORKAROUND_2457168
WORKAROUND_2645198
WORKAROUND_2658417
+WORKAROUND_AMPERE_AC03_CPU_38
WORKAROUND_TRBE_OVERWRITE_FILL_MODE
WORKAROUND_TSB_FLUSH_FAILURE
WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index b98b3b6c9da2..7bac43a3176e 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -90,7 +90,9 @@
#define EXC_INST_ACCESS 1
#define EXC_INST_ILLEGAL 2
#define EXC_BREAKPOINT 3
+#define EXC_LOAD_MISALIGNED 4
#define EXC_LOAD_ACCESS 5
+#define EXC_STORE_MISALIGNED 6
#define EXC_STORE_ACCESS 7
#define EXC_SYSCALL 8
#define EXC_HYPERVISOR_SYSCALL 9
diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h
index 1de0717112e5..1f37b600ca47 100644
--- a/arch/riscv/include/asm/kvm_aia.h
+++ b/arch/riscv/include/asm/kvm_aia.h
@@ -20,6 +20,33 @@ struct kvm_aia {
/* In-kernel irqchip initialized */
bool initialized;
+
+ /* Virtualization mode (Emulation, HW Accelerated, or Auto) */
+ u32 mode;
+
+ /* Number of MSIs */
+ u32 nr_ids;
+
+ /* Number of wired IRQs */
+ u32 nr_sources;
+
+ /* Number of group bits in IMSIC address */
+ u32 nr_group_bits;
+
+ /* Position of group bits in IMSIC address */
+ u32 nr_group_shift;
+
+ /* Number of hart bits in IMSIC address */
+ u32 nr_hart_bits;
+
+ /* Number of guest bits in IMSIC address */
+ u32 nr_guest_bits;
+
+ /* Guest physical address of APLIC */
+ gpa_t aplic_addr;
+
+ /* Internal state of APLIC */
+ void *aplic_state;
};
struct kvm_vcpu_aia_csr {
@@ -38,25 +65,53 @@ struct kvm_vcpu_aia {
/* CPU AIA CSR context upon Guest VCPU reset */
struct kvm_vcpu_aia_csr guest_reset_csr;
+
+ /* Guest physical address of IMSIC for this VCPU */
+ gpa_t imsic_addr;
+
+ /* HART index of IMSIC extacted from guest physical address */
+ u32 hart_index;
+
+ /* Internal state of IMSIC for this VCPU */
+ void *imsic_state;
};
+#define KVM_RISCV_AIA_UNDEF_ADDR (-1)
+
#define kvm_riscv_aia_initialized(k) ((k)->arch.aia.initialized)
#define irqchip_in_kernel(k) ((k)->arch.aia.in_kernel)
+extern unsigned int kvm_riscv_aia_nr_hgei;
+extern unsigned int kvm_riscv_aia_max_ids;
DECLARE_STATIC_KEY_FALSE(kvm_riscv_aia_available);
#define kvm_riscv_aia_available() \
static_branch_unlikely(&kvm_riscv_aia_available)
+extern struct kvm_device_ops kvm_riscv_aia_device_ops;
+
+void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu);
+int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu);
+
#define KVM_RISCV_AIA_IMSIC_TOPEI (ISELECT_MASK + 1)
-static inline int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu,
- unsigned long isel,
- unsigned long *val,
- unsigned long new_val,
- unsigned long wr_mask)
-{
- return 0;
-}
+int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask);
+int kvm_riscv_aia_imsic_rw_attr(struct kvm *kvm, unsigned long type,
+ bool write, unsigned long *val);
+int kvm_riscv_aia_imsic_has_attr(struct kvm *kvm, unsigned long type);
+void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu);
+int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu,
+ u32 guest_index, u32 offset, u32 iid);
+int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu);
+
+int kvm_riscv_aia_aplic_set_attr(struct kvm *kvm, unsigned long type, u32 v);
+int kvm_riscv_aia_aplic_get_attr(struct kvm *kvm, unsigned long type, u32 *v);
+int kvm_riscv_aia_aplic_has_attr(struct kvm *kvm, unsigned long type);
+int kvm_riscv_aia_aplic_inject(struct kvm *kvm, u32 source, bool level);
+int kvm_riscv_aia_aplic_init(struct kvm *kvm);
+void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm);
#ifdef CONFIG_32BIT
void kvm_riscv_vcpu_aia_flush_interrupts(struct kvm_vcpu *vcpu);
@@ -93,31 +148,23 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num,
{ .base = CSR_SIREG, .count = 1, .func = kvm_riscv_vcpu_aia_rmw_ireg }, \
{ .base = CSR_STOPEI, .count = 1, .func = kvm_riscv_vcpu_aia_rmw_topei },
-static inline int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu)
-{
- return 1;
-}
-
-static inline void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu)
-{
-}
-
-static inline int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
-{
- return 0;
-}
+int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu);
+int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu);
-static inline void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu)
-{
-}
+int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index,
+ u32 guest_index, u32 iid);
+int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
+int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level);
-static inline void kvm_riscv_aia_init_vm(struct kvm *kvm)
-{
-}
+void kvm_riscv_aia_init_vm(struct kvm *kvm);
+void kvm_riscv_aia_destroy_vm(struct kvm *kvm);
-static inline void kvm_riscv_aia_destroy_vm(struct kvm *kvm)
-{
-}
+int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner,
+ void __iomem **hgei_va, phys_addr_t *hgei_pa);
+void kvm_riscv_aia_free_hgei(int cpu, int hgei);
+void kvm_riscv_aia_wakeon_hgei(struct kvm_vcpu *owner, bool enable);
void kvm_riscv_aia_enable(void);
void kvm_riscv_aia_disable(void);
diff --git a/arch/riscv/include/asm/kvm_aia_aplic.h b/arch/riscv/include/asm/kvm_aia_aplic.h
new file mode 100644
index 000000000000..6dd1a4809ec1
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_aia_aplic.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+#ifndef __KVM_RISCV_AIA_IMSIC_H
+#define __KVM_RISCV_AIA_IMSIC_H
+
+#include <linux/bitops.h>
+
+#define APLIC_MAX_IDC BIT(14)
+#define APLIC_MAX_SOURCE 1024
+
+#define APLIC_DOMAINCFG 0x0000
+#define APLIC_DOMAINCFG_RDONLY 0x80000000
+#define APLIC_DOMAINCFG_IE BIT(8)
+#define APLIC_DOMAINCFG_DM BIT(2)
+#define APLIC_DOMAINCFG_BE BIT(0)
+
+#define APLIC_SOURCECFG_BASE 0x0004
+#define APLIC_SOURCECFG_D BIT(10)
+#define APLIC_SOURCECFG_CHILDIDX_MASK 0x000003ff
+#define APLIC_SOURCECFG_SM_MASK 0x00000007
+#define APLIC_SOURCECFG_SM_INACTIVE 0x0
+#define APLIC_SOURCECFG_SM_DETACH 0x1
+#define APLIC_SOURCECFG_SM_EDGE_RISE 0x4
+#define APLIC_SOURCECFG_SM_EDGE_FALL 0x5
+#define APLIC_SOURCECFG_SM_LEVEL_HIGH 0x6
+#define APLIC_SOURCECFG_SM_LEVEL_LOW 0x7
+
+#define APLIC_IRQBITS_PER_REG 32
+
+#define APLIC_SETIP_BASE 0x1c00
+#define APLIC_SETIPNUM 0x1cdc
+
+#define APLIC_CLRIP_BASE 0x1d00
+#define APLIC_CLRIPNUM 0x1ddc
+
+#define APLIC_SETIE_BASE 0x1e00
+#define APLIC_SETIENUM 0x1edc
+
+#define APLIC_CLRIE_BASE 0x1f00
+#define APLIC_CLRIENUM 0x1fdc
+
+#define APLIC_SETIPNUM_LE 0x2000
+#define APLIC_SETIPNUM_BE 0x2004
+
+#define APLIC_GENMSI 0x3000
+
+#define APLIC_TARGET_BASE 0x3004
+#define APLIC_TARGET_HART_IDX_SHIFT 18
+#define APLIC_TARGET_HART_IDX_MASK 0x3fff
+#define APLIC_TARGET_GUEST_IDX_SHIFT 12
+#define APLIC_TARGET_GUEST_IDX_MASK 0x3f
+#define APLIC_TARGET_IPRIO_MASK 0xff
+#define APLIC_TARGET_EIID_MASK 0x7ff
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_aia_imsic.h b/arch/riscv/include/asm/kvm_aia_imsic.h
new file mode 100644
index 000000000000..da5881d2bde0
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_aia_imsic.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+#ifndef __KVM_RISCV_AIA_IMSIC_H
+#define __KVM_RISCV_AIA_IMSIC_H
+
+#include <linux/types.h>
+#include <asm/csr.h>
+
+#define IMSIC_MMIO_PAGE_SHIFT 12
+#define IMSIC_MMIO_PAGE_SZ (1UL << IMSIC_MMIO_PAGE_SHIFT)
+#define IMSIC_MMIO_PAGE_LE 0x00
+#define IMSIC_MMIO_PAGE_BE 0x04
+
+#define IMSIC_MIN_ID 63
+#define IMSIC_MAX_ID 2048
+
+#define IMSIC_EIDELIVERY 0x70
+
+#define IMSIC_EITHRESHOLD 0x72
+
+#define IMSIC_EIP0 0x80
+#define IMSIC_EIP63 0xbf
+#define IMSIC_EIPx_BITS 32
+
+#define IMSIC_EIE0 0xc0
+#define IMSIC_EIE63 0xff
+#define IMSIC_EIEx_BITS 32
+
+#define IMSIC_FIRST IMSIC_EIDELIVERY
+#define IMSIC_LAST IMSIC_EIE63
+
+#define IMSIC_MMIO_SETIPNUM_LE 0x00
+#define IMSIC_MMIO_SETIPNUM_BE 0x04
+
+#endif
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index bd47a1dc2ff8..2d8ee53b66c7 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -28,6 +28,8 @@
#define KVM_VCPU_MAX_FEATURES 0
+#define KVM_IRQCHIP_NUM_PINS 1024
+
#define KVM_REQ_SLEEP \
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(1)
@@ -320,6 +322,8 @@ int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu);
+int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines);
+
void __kvm_riscv_unpriv_trap(void);
unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu,
diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h
index 4278125a38a5..cdcf0ff07be7 100644
--- a/arch/riscv/include/asm/kvm_vcpu_sbi.h
+++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h
@@ -14,9 +14,15 @@
#define KVM_SBI_VERSION_MAJOR 1
#define KVM_SBI_VERSION_MINOR 0
+enum kvm_riscv_sbi_ext_status {
+ KVM_RISCV_SBI_EXT_UNINITIALIZED,
+ KVM_RISCV_SBI_EXT_AVAILABLE,
+ KVM_RISCV_SBI_EXT_UNAVAILABLE,
+};
+
struct kvm_vcpu_sbi_context {
int return_handled;
- bool extension_disabled[KVM_RISCV_SBI_EXT_MAX];
+ enum kvm_riscv_sbi_ext_status ext_status[KVM_RISCV_SBI_EXT_MAX];
};
struct kvm_vcpu_sbi_return {
@@ -66,4 +72,7 @@ extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm;
extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental;
extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor;
+#ifdef CONFIG_RISCV_PMU_SBI
+extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu;
+#endif
#endif /* __RISCV_KVM_VCPU_SBI_H__ */
diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h
index 855c047e86d4..930fdc4101cd 100644
--- a/arch/riscv/include/uapi/asm/kvm.h
+++ b/arch/riscv/include/uapi/asm/kvm.h
@@ -15,6 +15,7 @@
#include <asm/bitsperlong.h>
#include <asm/ptrace.h>
+#define __KVM_HAVE_IRQ_LINE
#define __KVM_HAVE_READONLY_MEM
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
@@ -122,6 +123,7 @@ enum KVM_RISCV_ISA_EXT_ID {
KVM_RISCV_ISA_EXT_ZBB,
KVM_RISCV_ISA_EXT_SSAIA,
KVM_RISCV_ISA_EXT_V,
+ KVM_RISCV_ISA_EXT_SVNAPOT,
KVM_RISCV_ISA_EXT_MAX,
};
@@ -211,6 +213,77 @@ enum KVM_RISCV_SBI_EXT_ID {
#define KVM_REG_RISCV_VECTOR_REG(n) \
((n) + sizeof(struct __riscv_v_ext_state) / sizeof(unsigned long))
+/* Device Control API: RISC-V AIA */
+#define KVM_DEV_RISCV_APLIC_ALIGN 0x1000
+#define KVM_DEV_RISCV_APLIC_SIZE 0x4000
+#define KVM_DEV_RISCV_APLIC_MAX_HARTS 0x4000
+#define KVM_DEV_RISCV_IMSIC_ALIGN 0x1000
+#define KVM_DEV_RISCV_IMSIC_SIZE 0x1000
+
+#define KVM_DEV_RISCV_AIA_GRP_CONFIG 0
+#define KVM_DEV_RISCV_AIA_CONFIG_MODE 0
+#define KVM_DEV_RISCV_AIA_CONFIG_IDS 1
+#define KVM_DEV_RISCV_AIA_CONFIG_SRCS 2
+#define KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS 3
+#define KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT 4
+#define KVM_DEV_RISCV_AIA_CONFIG_HART_BITS 5
+#define KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS 6
+
+/*
+ * Modes of RISC-V AIA device:
+ * 1) EMUL (aka Emulation): Trap-n-emulate IMSIC
+ * 2) HWACCEL (aka HW Acceleration): Virtualize IMSIC using IMSIC guest files
+ * 3) AUTO (aka Automatic): Virtualize IMSIC using IMSIC guest files whenever
+ * available otherwise fallback to trap-n-emulation
+ */
+#define KVM_DEV_RISCV_AIA_MODE_EMUL 0
+#define KVM_DEV_RISCV_AIA_MODE_HWACCEL 1
+#define KVM_DEV_RISCV_AIA_MODE_AUTO 2
+
+#define KVM_DEV_RISCV_AIA_IDS_MIN 63
+#define KVM_DEV_RISCV_AIA_IDS_MAX 2048
+#define KVM_DEV_RISCV_AIA_SRCS_MAX 1024
+#define KVM_DEV_RISCV_AIA_GROUP_BITS_MAX 8
+#define KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN 24
+#define KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX 56
+#define KVM_DEV_RISCV_AIA_HART_BITS_MAX 16
+#define KVM_DEV_RISCV_AIA_GUEST_BITS_MAX 8
+
+#define KVM_DEV_RISCV_AIA_GRP_ADDR 1
+#define KVM_DEV_RISCV_AIA_ADDR_APLIC 0
+#define KVM_DEV_RISCV_AIA_ADDR_IMSIC(__vcpu) (1 + (__vcpu))
+#define KVM_DEV_RISCV_AIA_ADDR_MAX \
+ (1 + KVM_DEV_RISCV_APLIC_MAX_HARTS)
+
+#define KVM_DEV_RISCV_AIA_GRP_CTRL 2
+#define KVM_DEV_RISCV_AIA_CTRL_INIT 0
+
+/*
+ * The device attribute type contains the memory mapped offset of the
+ * APLIC register (range 0x0000-0x3FFF) and it must be 4-byte aligned.
+ */
+#define KVM_DEV_RISCV_AIA_GRP_APLIC 3
+
+/*
+ * The lower 12-bits of the device attribute type contains the iselect
+ * value of the IMSIC register (range 0x70-0xFF) whereas the higher order
+ * bits contains the VCPU id.
+ */
+#define KVM_DEV_RISCV_AIA_GRP_IMSIC 4
+#define KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS 12
+#define KVM_DEV_RISCV_AIA_IMSIC_ISEL_MASK \
+ ((1U << KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS) - 1)
+#define KVM_DEV_RISCV_AIA_IMSIC_MKATTR(__vcpu, __isel) \
+ (((__vcpu) << KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS) | \
+ ((__isel) & KVM_DEV_RISCV_AIA_IMSIC_ISEL_MASK))
+#define KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(__attr) \
+ ((__attr) & KVM_DEV_RISCV_AIA_IMSIC_ISEL_MASK)
+#define KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(__attr) \
+ ((__attr) >> KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS)
+
+/* One single KVM irqchip, ie. the AIA */
+#define KVM_NR_IRQCHIPS 1
+
#endif
#endif /* __LINUX_KVM_RISCV_H */
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index 28891e583259..dfc237d7875b 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -21,6 +21,10 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
depends on RISCV_SBI && MMU
select HAVE_KVM_EVENTFD
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_MSI
select HAVE_KVM_VCPU_ASYNC_IOCTL
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_GENERIC_HARDWARE_ENABLING
diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile
index 7b4c21f9aa6a..fee0671e2dc1 100644
--- a/arch/riscv/kvm/Makefile
+++ b/arch/riscv/kvm/Makefile
@@ -28,3 +28,6 @@ kvm-y += vcpu_sbi_hsm.o
kvm-y += vcpu_timer.o
kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o
kvm-y += aia.o
+kvm-y += aia_device.o
+kvm-y += aia_aplic.o
+kvm-y += aia_imsic.o
diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c
index 4f1286fc7f17..585a3b42c52c 100644
--- a/arch/riscv/kvm/aia.c
+++ b/arch/riscv/kvm/aia.c
@@ -8,11 +8,49 @@
*/
#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
#include <linux/kvm_host.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
#include <asm/hwcap.h>
+#include <asm/kvm_aia_imsic.h>
+struct aia_hgei_control {
+ raw_spinlock_t lock;
+ unsigned long free_bitmap;
+ struct kvm_vcpu *owners[BITS_PER_LONG];
+};
+static DEFINE_PER_CPU(struct aia_hgei_control, aia_hgei);
+static int hgei_parent_irq;
+
+unsigned int kvm_riscv_aia_nr_hgei;
+unsigned int kvm_riscv_aia_max_ids;
DEFINE_STATIC_KEY_FALSE(kvm_riscv_aia_available);
+static int aia_find_hgei(struct kvm_vcpu *owner)
+{
+ int i, hgei;
+ unsigned long flags;
+ struct aia_hgei_control *hgctrl = get_cpu_ptr(&aia_hgei);
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ hgei = -1;
+ for (i = 1; i <= kvm_riscv_aia_nr_hgei; i++) {
+ if (hgctrl->owners[i] == owner) {
+ hgei = i;
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ put_cpu_ptr(&aia_hgei);
+ return hgei;
+}
+
static void aia_set_hvictl(bool ext_irq_pending)
{
unsigned long hvictl;
@@ -56,6 +94,7 @@ void kvm_riscv_vcpu_aia_sync_interrupts(struct kvm_vcpu *vcpu)
bool kvm_riscv_vcpu_aia_has_interrupts(struct kvm_vcpu *vcpu, u64 mask)
{
+ int hgei;
unsigned long seip;
if (!kvm_riscv_aia_available())
@@ -74,6 +113,10 @@ bool kvm_riscv_vcpu_aia_has_interrupts(struct kvm_vcpu *vcpu, u64 mask)
if (!kvm_riscv_aia_initialized(vcpu->kvm) || !seip)
return false;
+ hgei = aia_find_hgei(vcpu);
+ if (hgei > 0)
+ return !!(csr_read(CSR_HGEIP) & BIT(hgei));
+
return false;
}
@@ -323,8 +366,6 @@ static int aia_rmw_iprio(struct kvm_vcpu *vcpu, unsigned int isel,
return KVM_INSN_CONTINUE_NEXT_SEPC;
}
-#define IMSIC_FIRST 0x70
-#define IMSIC_LAST 0xff
int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num,
unsigned long *val, unsigned long new_val,
unsigned long wr_mask)
@@ -348,6 +389,143 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num,
return KVM_INSN_EXIT_TO_USER_SPACE;
}
+int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner,
+ void __iomem **hgei_va, phys_addr_t *hgei_pa)
+{
+ int ret = -ENOENT;
+ unsigned long flags;
+ struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu);
+
+ if (!kvm_riscv_aia_available() || !hgctrl)
+ return -ENODEV;
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ if (hgctrl->free_bitmap) {
+ ret = __ffs(hgctrl->free_bitmap);
+ hgctrl->free_bitmap &= ~BIT(ret);
+ hgctrl->owners[ret] = owner;
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ /* TODO: To be updated later by AIA IMSIC HW guest file support */
+ if (hgei_va)
+ *hgei_va = NULL;
+ if (hgei_pa)
+ *hgei_pa = 0;
+
+ return ret;
+}
+
+void kvm_riscv_aia_free_hgei(int cpu, int hgei)
+{
+ unsigned long flags;
+ struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu);
+
+ if (!kvm_riscv_aia_available() || !hgctrl)
+ return;
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ if (hgei > 0 && hgei <= kvm_riscv_aia_nr_hgei) {
+ if (!(hgctrl->free_bitmap & BIT(hgei))) {
+ hgctrl->free_bitmap |= BIT(hgei);
+ hgctrl->owners[hgei] = NULL;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+}
+
+void kvm_riscv_aia_wakeon_hgei(struct kvm_vcpu *owner, bool enable)
+{
+ int hgei;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+ hgei = aia_find_hgei(owner);
+ if (hgei > 0) {
+ if (enable)
+ csr_set(CSR_HGEIE, BIT(hgei));
+ else
+ csr_clear(CSR_HGEIE, BIT(hgei));
+ }
+}
+
+static irqreturn_t hgei_interrupt(int irq, void *dev_id)
+{
+ int i;
+ unsigned long hgei_mask, flags;
+ struct aia_hgei_control *hgctrl = get_cpu_ptr(&aia_hgei);
+
+ hgei_mask = csr_read(CSR_HGEIP) & csr_read(CSR_HGEIE);
+ csr_clear(CSR_HGEIE, hgei_mask);
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ for_each_set_bit(i, &hgei_mask, BITS_PER_LONG) {
+ if (hgctrl->owners[i])
+ kvm_vcpu_kick(hgctrl->owners[i]);
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ put_cpu_ptr(&aia_hgei);
+ return IRQ_HANDLED;
+}
+
+static int aia_hgei_init(void)
+{
+ int cpu, rc;
+ struct irq_domain *domain;
+ struct aia_hgei_control *hgctrl;
+
+ /* Initialize per-CPU guest external interrupt line management */
+ for_each_possible_cpu(cpu) {
+ hgctrl = per_cpu_ptr(&aia_hgei, cpu);
+ raw_spin_lock_init(&hgctrl->lock);
+ if (kvm_riscv_aia_nr_hgei) {
+ hgctrl->free_bitmap =
+ BIT(kvm_riscv_aia_nr_hgei + 1) - 1;
+ hgctrl->free_bitmap &= ~BIT(0);
+ } else
+ hgctrl->free_bitmap = 0;
+ }
+
+ /* Find INTC irq domain */
+ domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(),
+ DOMAIN_BUS_ANY);
+ if (!domain) {
+ kvm_err("unable to find INTC domain\n");
+ return -ENOENT;
+ }
+
+ /* Map per-CPU SGEI interrupt from INTC domain */
+ hgei_parent_irq = irq_create_mapping(domain, IRQ_S_GEXT);
+ if (!hgei_parent_irq) {
+ kvm_err("unable to map SGEI IRQ\n");
+ return -ENOMEM;
+ }
+
+ /* Request per-CPU SGEI interrupt */
+ rc = request_percpu_irq(hgei_parent_irq, hgei_interrupt,
+ "riscv-kvm", &aia_hgei);
+ if (rc) {
+ kvm_err("failed to request SGEI IRQ\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+static void aia_hgei_exit(void)
+{
+ /* Free per-CPU SGEI interrupt */
+ free_percpu_irq(hgei_parent_irq, &aia_hgei);
+}
+
void kvm_riscv_aia_enable(void)
{
if (!kvm_riscv_aia_available())
@@ -362,21 +540,105 @@ void kvm_riscv_aia_enable(void)
csr_write(CSR_HVIPRIO1H, 0x0);
csr_write(CSR_HVIPRIO2H, 0x0);
#endif
+
+ /* Enable per-CPU SGEI interrupt */
+ enable_percpu_irq(hgei_parent_irq,
+ irq_get_trigger_type(hgei_parent_irq));
+ csr_set(CSR_HIE, BIT(IRQ_S_GEXT));
}
void kvm_riscv_aia_disable(void)
{
+ int i;
+ unsigned long flags;
+ struct kvm_vcpu *vcpu;
+ struct aia_hgei_control *hgctrl;
+
if (!kvm_riscv_aia_available())
return;
+ hgctrl = get_cpu_ptr(&aia_hgei);
+
+ /* Disable per-CPU SGEI interrupt */
+ csr_clear(CSR_HIE, BIT(IRQ_S_GEXT));
+ disable_percpu_irq(hgei_parent_irq);
aia_set_hvictl(false);
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+
+ for (i = 0; i <= kvm_riscv_aia_nr_hgei; i++) {
+ vcpu = hgctrl->owners[i];
+ if (!vcpu)
+ continue;
+
+ /*
+ * We release hgctrl->lock before notifying IMSIC
+ * so that we don't have lock ordering issues.
+ */
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ /* Notify IMSIC */
+ kvm_riscv_vcpu_aia_imsic_release(vcpu);
+
+ /*
+ * Wakeup VCPU if it was blocked so that it can
+ * run on other HARTs
+ */
+ if (csr_read(CSR_HGEIE) & BIT(i)) {
+ csr_clear(CSR_HGEIE, BIT(i));
+ kvm_vcpu_kick(vcpu);
+ }
+
+ raw_spin_lock_irqsave(&hgctrl->lock, flags);
+ }
+
+ raw_spin_unlock_irqrestore(&hgctrl->lock, flags);
+
+ put_cpu_ptr(&aia_hgei);
}
int kvm_riscv_aia_init(void)
{
+ int rc;
+
if (!riscv_isa_extension_available(NULL, SxAIA))
return -ENODEV;
+ /* Figure-out number of bits in HGEIE */
+ csr_write(CSR_HGEIE, -1UL);
+ kvm_riscv_aia_nr_hgei = fls_long(csr_read(CSR_HGEIE));
+ csr_write(CSR_HGEIE, 0);
+ if (kvm_riscv_aia_nr_hgei)
+ kvm_riscv_aia_nr_hgei--;
+
+ /*
+ * Number of usable HGEI lines should be minimum of per-HART
+ * IMSIC guest files and number of bits in HGEIE
+ *
+ * TODO: To be updated later by AIA IMSIC HW guest file support
+ */
+ kvm_riscv_aia_nr_hgei = 0;
+
+ /*
+ * Find number of guest MSI IDs
+ *
+ * TODO: To be updated later by AIA IMSIC HW guest file support
+ */
+ kvm_riscv_aia_max_ids = IMSIC_MAX_ID;
+
+ /* Initialize guest external interrupt line management */
+ rc = aia_hgei_init();
+ if (rc)
+ return rc;
+
+ /* Register device operations */
+ rc = kvm_register_device_ops(&kvm_riscv_aia_device_ops,
+ KVM_DEV_TYPE_RISCV_AIA);
+ if (rc) {
+ aia_hgei_exit();
+ return rc;
+ }
+
/* Enable KVM AIA support */
static_branch_enable(&kvm_riscv_aia_available);
@@ -385,4 +647,12 @@ int kvm_riscv_aia_init(void)
void kvm_riscv_aia_exit(void)
{
+ if (!kvm_riscv_aia_available())
+ return;
+
+ /* Unregister device operations */
+ kvm_unregister_device_ops(KVM_DEV_TYPE_RISCV_AIA);
+
+ /* Cleanup the HGEI state */
+ aia_hgei_exit();
}
diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c
new file mode 100644
index 000000000000..39e72aa016a4
--- /dev/null
+++ b/arch/riscv/kvm/aia_aplic.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/math.h>
+#include <linux/spinlock.h>
+#include <linux/swab.h>
+#include <kvm/iodev.h>
+#include <asm/kvm_aia_aplic.h>
+
+struct aplic_irq {
+ raw_spinlock_t lock;
+ u32 sourcecfg;
+ u32 state;
+#define APLIC_IRQ_STATE_PENDING BIT(0)
+#define APLIC_IRQ_STATE_ENABLED BIT(1)
+#define APLIC_IRQ_STATE_ENPEND (APLIC_IRQ_STATE_PENDING | \
+ APLIC_IRQ_STATE_ENABLED)
+#define APLIC_IRQ_STATE_INPUT BIT(8)
+ u32 target;
+};
+
+struct aplic {
+ struct kvm_io_device iodev;
+
+ u32 domaincfg;
+ u32 genmsi;
+
+ u32 nr_irqs;
+ u32 nr_words;
+ struct aplic_irq *irqs;
+};
+
+static u32 aplic_read_sourcecfg(struct aplic *aplic, u32 irq)
+{
+ u32 ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return 0;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = irqd->sourcecfg;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_sourcecfg(struct aplic *aplic, u32 irq, u32 val)
+{
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ if (val & APLIC_SOURCECFG_D)
+ val = 0;
+ else
+ val &= APLIC_SOURCECFG_SM_MASK;
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ irqd->sourcecfg = val;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static u32 aplic_read_target(struct aplic *aplic, u32 irq)
+{
+ u32 ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return 0;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = irqd->target;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_target(struct aplic *aplic, u32 irq, u32 val)
+{
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ val &= APLIC_TARGET_EIID_MASK |
+ (APLIC_TARGET_HART_IDX_MASK << APLIC_TARGET_HART_IDX_SHIFT) |
+ (APLIC_TARGET_GUEST_IDX_MASK << APLIC_TARGET_GUEST_IDX_SHIFT);
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ irqd->target = val;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static bool aplic_read_pending(struct aplic *aplic, u32 irq)
+{
+ bool ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return false;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = (irqd->state & APLIC_IRQ_STATE_PENDING) ? true : false;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending)
+{
+ unsigned long flags, sm;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+
+ sm = irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK;
+ if (!pending &&
+ ((sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) ||
+ (sm == APLIC_SOURCECFG_SM_LEVEL_LOW)))
+ goto skip_write_pending;
+
+ if (pending)
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ else
+ irqd->state &= ~APLIC_IRQ_STATE_PENDING;
+
+skip_write_pending:
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static bool aplic_read_enabled(struct aplic *aplic, u32 irq)
+{
+ bool ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return false;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = (irqd->state & APLIC_IRQ_STATE_ENABLED) ? true : false;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_write_enabled(struct aplic *aplic, u32 irq, bool enabled)
+{
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ if (enabled)
+ irqd->state |= APLIC_IRQ_STATE_ENABLED;
+ else
+ irqd->state &= ~APLIC_IRQ_STATE_ENABLED;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+}
+
+static bool aplic_read_input(struct aplic *aplic, u32 irq)
+{
+ bool ret;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+
+ if (!irq || aplic->nr_irqs <= irq)
+ return false;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+ ret = (irqd->state & APLIC_IRQ_STATE_INPUT) ? true : false;
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ return ret;
+}
+
+static void aplic_inject_msi(struct kvm *kvm, u32 irq, u32 target)
+{
+ u32 hart_idx, guest_idx, eiid;
+
+ hart_idx = target >> APLIC_TARGET_HART_IDX_SHIFT;
+ hart_idx &= APLIC_TARGET_HART_IDX_MASK;
+ guest_idx = target >> APLIC_TARGET_GUEST_IDX_SHIFT;
+ guest_idx &= APLIC_TARGET_GUEST_IDX_MASK;
+ eiid = target & APLIC_TARGET_EIID_MASK;
+ kvm_riscv_aia_inject_msi_by_id(kvm, hart_idx, guest_idx, eiid);
+}
+
+static void aplic_update_irq_range(struct kvm *kvm, u32 first, u32 last)
+{
+ bool inject;
+ u32 irq, target;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if (!(aplic->domaincfg & APLIC_DOMAINCFG_IE))
+ return;
+
+ for (irq = first; irq <= last; irq++) {
+ if (!irq || aplic->nr_irqs <= irq)
+ continue;
+ irqd = &aplic->irqs[irq];
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+
+ inject = false;
+ target = irqd->target;
+ if ((irqd->state & APLIC_IRQ_STATE_ENPEND) ==
+ APLIC_IRQ_STATE_ENPEND) {
+ irqd->state &= ~APLIC_IRQ_STATE_PENDING;
+ inject = true;
+ }
+
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ if (inject)
+ aplic_inject_msi(kvm, irq, target);
+ }
+}
+
+int kvm_riscv_aia_aplic_inject(struct kvm *kvm, u32 source, bool level)
+{
+ u32 target;
+ bool inject = false, ie;
+ unsigned long flags;
+ struct aplic_irq *irqd;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if (!aplic || !source || (aplic->nr_irqs <= source))
+ return -ENODEV;
+ irqd = &aplic->irqs[source];
+ ie = (aplic->domaincfg & APLIC_DOMAINCFG_IE) ? true : false;
+
+ raw_spin_lock_irqsave(&irqd->lock, flags);
+
+ if (irqd->sourcecfg & APLIC_SOURCECFG_D)
+ goto skip_unlock;
+
+ switch (irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK) {
+ case APLIC_SOURCECFG_SM_EDGE_RISE:
+ if (level && !(irqd->state & APLIC_IRQ_STATE_INPUT) &&
+ !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ case APLIC_SOURCECFG_SM_EDGE_FALL:
+ if (!level && (irqd->state & APLIC_IRQ_STATE_INPUT) &&
+ !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ case APLIC_SOURCECFG_SM_LEVEL_HIGH:
+ if (level && !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ case APLIC_SOURCECFG_SM_LEVEL_LOW:
+ if (!level && !(irqd->state & APLIC_IRQ_STATE_PENDING))
+ irqd->state |= APLIC_IRQ_STATE_PENDING;
+ break;
+ }
+
+ if (level)
+ irqd->state |= APLIC_IRQ_STATE_INPUT;
+ else
+ irqd->state &= ~APLIC_IRQ_STATE_INPUT;
+
+ target = irqd->target;
+ if (ie && ((irqd->state & APLIC_IRQ_STATE_ENPEND) ==
+ APLIC_IRQ_STATE_ENPEND)) {
+ irqd->state &= ~APLIC_IRQ_STATE_PENDING;
+ inject = true;
+ }
+
+skip_unlock:
+ raw_spin_unlock_irqrestore(&irqd->lock, flags);
+
+ if (inject)
+ aplic_inject_msi(kvm, source, target);
+
+ return 0;
+}
+
+static u32 aplic_read_input_word(struct aplic *aplic, u32 word)
+{
+ u32 i, ret = 0;
+
+ for (i = 0; i < 32; i++)
+ ret |= aplic_read_input(aplic, word * 32 + i) ? BIT(i) : 0;
+
+ return ret;
+}
+
+static u32 aplic_read_pending_word(struct aplic *aplic, u32 word)
+{
+ u32 i, ret = 0;
+
+ for (i = 0; i < 32; i++)
+ ret |= aplic_read_pending(aplic, word * 32 + i) ? BIT(i) : 0;
+
+ return ret;
+}
+
+static void aplic_write_pending_word(struct aplic *aplic, u32 word,
+ u32 val, bool pending)
+{
+ u32 i;
+
+ for (i = 0; i < 32; i++) {
+ if (val & BIT(i))
+ aplic_write_pending(aplic, word * 32 + i, pending);
+ }
+}
+
+static u32 aplic_read_enabled_word(struct aplic *aplic, u32 word)
+{
+ u32 i, ret = 0;
+
+ for (i = 0; i < 32; i++)
+ ret |= aplic_read_enabled(aplic, word * 32 + i) ? BIT(i) : 0;
+
+ return ret;
+}
+
+static void aplic_write_enabled_word(struct aplic *aplic, u32 word,
+ u32 val, bool enabled)
+{
+ u32 i;
+
+ for (i = 0; i < 32; i++) {
+ if (val & BIT(i))
+ aplic_write_enabled(aplic, word * 32 + i, enabled);
+ }
+}
+
+static int aplic_mmio_read_offset(struct kvm *kvm, gpa_t off, u32 *val32)
+{
+ u32 i;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if ((off & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ if (off == APLIC_DOMAINCFG) {
+ *val32 = APLIC_DOMAINCFG_RDONLY |
+ aplic->domaincfg | APLIC_DOMAINCFG_DM;
+ } else if ((off >= APLIC_SOURCECFG_BASE) &&
+ (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1;
+ *val32 = aplic_read_sourcecfg(aplic, i);
+ } else if ((off >= APLIC_SETIP_BASE) &&
+ (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIP_BASE) >> 2;
+ *val32 = aplic_read_pending_word(aplic, i);
+ } else if (off == APLIC_SETIPNUM) {
+ *val32 = 0;
+ } else if ((off >= APLIC_CLRIP_BASE) &&
+ (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_CLRIP_BASE) >> 2;
+ *val32 = aplic_read_input_word(aplic, i);
+ } else if (off == APLIC_CLRIPNUM) {
+ *val32 = 0;
+ } else if ((off >= APLIC_SETIE_BASE) &&
+ (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIE_BASE) >> 2;
+ *val32 = aplic_read_enabled_word(aplic, i);
+ } else if (off == APLIC_SETIENUM) {
+ *val32 = 0;
+ } else if ((off >= APLIC_CLRIE_BASE) &&
+ (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) {
+ *val32 = 0;
+ } else if (off == APLIC_CLRIENUM) {
+ *val32 = 0;
+ } else if (off == APLIC_SETIPNUM_LE) {
+ *val32 = 0;
+ } else if (off == APLIC_SETIPNUM_BE) {
+ *val32 = 0;
+ } else if (off == APLIC_GENMSI) {
+ *val32 = aplic->genmsi;
+ } else if ((off >= APLIC_TARGET_BASE) &&
+ (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_TARGET_BASE) >> 2) + 1;
+ *val32 = aplic_read_target(aplic, i);
+ } else
+ return -ENODEV;
+
+ return 0;
+}
+
+static int aplic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ if (len != 4)
+ return -EOPNOTSUPP;
+
+ return aplic_mmio_read_offset(vcpu->kvm,
+ addr - vcpu->kvm->arch.aia.aplic_addr,
+ val);
+}
+
+static int aplic_mmio_write_offset(struct kvm *kvm, gpa_t off, u32 val32)
+{
+ u32 i;
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if ((off & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ if (off == APLIC_DOMAINCFG) {
+ /* Only IE bit writeable */
+ aplic->domaincfg = val32 & APLIC_DOMAINCFG_IE;
+ } else if ((off >= APLIC_SOURCECFG_BASE) &&
+ (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1;
+ aplic_write_sourcecfg(aplic, i, val32);
+ } else if ((off >= APLIC_SETIP_BASE) &&
+ (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIP_BASE) >> 2;
+ aplic_write_pending_word(aplic, i, val32, true);
+ } else if (off == APLIC_SETIPNUM) {
+ aplic_write_pending(aplic, val32, true);
+ } else if ((off >= APLIC_CLRIP_BASE) &&
+ (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_CLRIP_BASE) >> 2;
+ aplic_write_pending_word(aplic, i, val32, false);
+ } else if (off == APLIC_CLRIPNUM) {
+ aplic_write_pending(aplic, val32, false);
+ } else if ((off >= APLIC_SETIE_BASE) &&
+ (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_SETIE_BASE) >> 2;
+ aplic_write_enabled_word(aplic, i, val32, true);
+ } else if (off == APLIC_SETIENUM) {
+ aplic_write_enabled(aplic, val32, true);
+ } else if ((off >= APLIC_CLRIE_BASE) &&
+ (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) {
+ i = (off - APLIC_CLRIE_BASE) >> 2;
+ aplic_write_enabled_word(aplic, i, val32, false);
+ } else if (off == APLIC_CLRIENUM) {
+ aplic_write_enabled(aplic, val32, false);
+ } else if (off == APLIC_SETIPNUM_LE) {
+ aplic_write_pending(aplic, val32, true);
+ } else if (off == APLIC_SETIPNUM_BE) {
+ aplic_write_pending(aplic, __swab32(val32), true);
+ } else if (off == APLIC_GENMSI) {
+ aplic->genmsi = val32 & ~(APLIC_TARGET_GUEST_IDX_MASK <<
+ APLIC_TARGET_GUEST_IDX_SHIFT);
+ kvm_riscv_aia_inject_msi_by_id(kvm,
+ val32 >> APLIC_TARGET_HART_IDX_SHIFT, 0,
+ val32 & APLIC_TARGET_EIID_MASK);
+ } else if ((off >= APLIC_TARGET_BASE) &&
+ (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) {
+ i = ((off - APLIC_TARGET_BASE) >> 2) + 1;
+ aplic_write_target(aplic, i, val32);
+ } else
+ return -ENODEV;
+
+ aplic_update_irq_range(kvm, 1, aplic->nr_irqs - 1);
+
+ return 0;
+}
+
+static int aplic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ if (len != 4)
+ return -EOPNOTSUPP;
+
+ return aplic_mmio_write_offset(vcpu->kvm,
+ addr - vcpu->kvm->arch.aia.aplic_addr,
+ *((const u32 *)val));
+}
+
+static struct kvm_io_device_ops aplic_iodoev_ops = {
+ .read = aplic_mmio_read,
+ .write = aplic_mmio_write,
+};
+
+int kvm_riscv_aia_aplic_set_attr(struct kvm *kvm, unsigned long type, u32 v)
+{
+ int rc;
+
+ if (!kvm->arch.aia.aplic_state)
+ return -ENODEV;
+
+ rc = aplic_mmio_write_offset(kvm, type, v);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+int kvm_riscv_aia_aplic_get_attr(struct kvm *kvm, unsigned long type, u32 *v)
+{
+ int rc;
+
+ if (!kvm->arch.aia.aplic_state)
+ return -ENODEV;
+
+ rc = aplic_mmio_read_offset(kvm, type, v);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+int kvm_riscv_aia_aplic_has_attr(struct kvm *kvm, unsigned long type)
+{
+ int rc;
+ u32 val;
+
+ if (!kvm->arch.aia.aplic_state)
+ return -ENODEV;
+
+ rc = aplic_mmio_read_offset(kvm, type, &val);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+int kvm_riscv_aia_aplic_init(struct kvm *kvm)
+{
+ int i, ret = 0;
+ struct aplic *aplic;
+
+ /* Do nothing if we have zero sources */
+ if (!kvm->arch.aia.nr_sources)
+ return 0;
+
+ /* Allocate APLIC global state */
+ aplic = kzalloc(sizeof(*aplic), GFP_KERNEL);
+ if (!aplic)
+ return -ENOMEM;
+ kvm->arch.aia.aplic_state = aplic;
+
+ /* Setup APLIC IRQs */
+ aplic->nr_irqs = kvm->arch.aia.nr_sources + 1;
+ aplic->nr_words = DIV_ROUND_UP(aplic->nr_irqs, 32);
+ aplic->irqs = kcalloc(aplic->nr_irqs,
+ sizeof(*aplic->irqs), GFP_KERNEL);
+ if (!aplic->irqs) {
+ ret = -ENOMEM;
+ goto fail_free_aplic;
+ }
+ for (i = 0; i < aplic->nr_irqs; i++)
+ raw_spin_lock_init(&aplic->irqs[i].lock);
+
+ /* Setup IO device */
+ kvm_iodevice_init(&aplic->iodev, &aplic_iodoev_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+ kvm->arch.aia.aplic_addr,
+ KVM_DEV_RISCV_APLIC_SIZE,
+ &aplic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret)
+ goto fail_free_aplic_irqs;
+
+ /* Setup default IRQ routing */
+ ret = kvm_riscv_setup_default_irq_routing(kvm, aplic->nr_irqs);
+ if (ret)
+ goto fail_unreg_iodev;
+
+ return 0;
+
+fail_unreg_iodev:
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+fail_free_aplic_irqs:
+ kfree(aplic->irqs);
+fail_free_aplic:
+ kvm->arch.aia.aplic_state = NULL;
+ kfree(aplic);
+ return ret;
+}
+
+void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm)
+{
+ struct aplic *aplic = kvm->arch.aia.aplic_state;
+
+ if (!aplic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(aplic->irqs);
+
+ kvm->arch.aia.aplic_state = NULL;
+ kfree(aplic);
+}
diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c
new file mode 100644
index 000000000000..0eb689351b7d
--- /dev/null
+++ b/arch/riscv/kvm/aia_device.c
@@ -0,0 +1,673 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_aia_imsic.h>
+
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+ struct kvm_vcpu *tmp_vcpu;
+
+ for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+ tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+ mutex_unlock(&tmp_vcpu->mutex);
+ }
+}
+
+static void unlock_all_vcpus(struct kvm *kvm)
+{
+ unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+static bool lock_all_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *tmp_vcpu;
+ unsigned long c;
+
+ kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+ if (!mutex_trylock(&tmp_vcpu->mutex)) {
+ unlock_vcpus(kvm, c - 1);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int aia_create(struct kvm_device *dev, u32 type)
+{
+ int ret;
+ unsigned long i;
+ struct kvm *kvm = dev->kvm;
+ struct kvm_vcpu *vcpu;
+
+ if (irqchip_in_kernel(kvm))
+ return -EEXIST;
+
+ ret = -EBUSY;
+ if (!lock_all_vcpus(kvm))
+ return ret;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->arch.ran_atleast_once)
+ goto out_unlock;
+ }
+ ret = 0;
+
+ kvm->arch.aia.in_kernel = true;
+
+out_unlock:
+ unlock_all_vcpus(kvm);
+ return ret;
+}
+
+static void aia_destroy(struct kvm_device *dev)
+{
+ kfree(dev);
+}
+
+static int aia_config(struct kvm *kvm, unsigned long type,
+ u32 *nr, bool write)
+{
+ struct kvm_aia *aia = &kvm->arch.aia;
+
+ /* Writes can only be done before irqchip is initialized */
+ if (write && kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ switch (type) {
+ case KVM_DEV_RISCV_AIA_CONFIG_MODE:
+ if (write) {
+ switch (*nr) {
+ case KVM_DEV_RISCV_AIA_MODE_EMUL:
+ break;
+ case KVM_DEV_RISCV_AIA_MODE_HWACCEL:
+ case KVM_DEV_RISCV_AIA_MODE_AUTO:
+ /*
+ * HW Acceleration and Auto modes only
+ * supported on host with non-zero guest
+ * external interrupts (i.e. non-zero
+ * VS-level IMSIC pages).
+ */
+ if (!kvm_riscv_aia_nr_hgei)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ aia->mode = *nr;
+ } else
+ *nr = aia->mode;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_IDS:
+ if (write) {
+ if ((*nr < KVM_DEV_RISCV_AIA_IDS_MIN) ||
+ (*nr >= KVM_DEV_RISCV_AIA_IDS_MAX) ||
+ ((*nr & KVM_DEV_RISCV_AIA_IDS_MIN) !=
+ KVM_DEV_RISCV_AIA_IDS_MIN) ||
+ (kvm_riscv_aia_max_ids <= *nr))
+ return -EINVAL;
+ aia->nr_ids = *nr;
+ } else
+ *nr = aia->nr_ids;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_SRCS:
+ if (write) {
+ if ((*nr >= KVM_DEV_RISCV_AIA_SRCS_MAX) ||
+ (*nr >= kvm_riscv_aia_max_ids))
+ return -EINVAL;
+ aia->nr_sources = *nr;
+ } else
+ *nr = aia->nr_sources;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS:
+ if (write) {
+ if (*nr >= KVM_DEV_RISCV_AIA_GROUP_BITS_MAX)
+ return -EINVAL;
+ aia->nr_group_bits = *nr;
+ } else
+ *nr = aia->nr_group_bits;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT:
+ if (write) {
+ if ((*nr < KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN) ||
+ (*nr >= KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX))
+ return -EINVAL;
+ aia->nr_group_shift = *nr;
+ } else
+ *nr = aia->nr_group_shift;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS:
+ if (write) {
+ if (*nr >= KVM_DEV_RISCV_AIA_HART_BITS_MAX)
+ return -EINVAL;
+ aia->nr_hart_bits = *nr;
+ } else
+ *nr = aia->nr_hart_bits;
+ break;
+ case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS:
+ if (write) {
+ if (*nr >= KVM_DEV_RISCV_AIA_GUEST_BITS_MAX)
+ return -EINVAL;
+ aia->nr_guest_bits = *nr;
+ } else
+ *nr = aia->nr_guest_bits;
+ break;
+ default:
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int aia_aplic_addr(struct kvm *kvm, u64 *addr, bool write)
+{
+ struct kvm_aia *aia = &kvm->arch.aia;
+
+ if (write) {
+ /* Writes can only be done before irqchip is initialized */
+ if (kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ if (*addr & (KVM_DEV_RISCV_APLIC_ALIGN - 1))
+ return -EINVAL;
+
+ aia->aplic_addr = *addr;
+ } else
+ *addr = aia->aplic_addr;
+
+ return 0;
+}
+
+static int aia_imsic_addr(struct kvm *kvm, u64 *addr,
+ unsigned long vcpu_idx, bool write)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_aia *vcpu_aia;
+
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ if (!vcpu)
+ return -EINVAL;
+ vcpu_aia = &vcpu->arch.aia_context;
+
+ if (write) {
+ /* Writes can only be done before irqchip is initialized */
+ if (kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ if (*addr & (KVM_DEV_RISCV_IMSIC_ALIGN - 1))
+ return -EINVAL;
+ }
+
+ mutex_lock(&vcpu->mutex);
+ if (write)
+ vcpu_aia->imsic_addr = *addr;
+ else
+ *addr = vcpu_aia->imsic_addr;
+ mutex_unlock(&vcpu->mutex);
+
+ return 0;
+}
+
+static gpa_t aia_imsic_ppn(struct kvm_aia *aia, gpa_t addr)
+{
+ u32 h, l;
+ gpa_t mask = 0;
+
+ h = aia->nr_hart_bits + aia->nr_guest_bits +
+ IMSIC_MMIO_PAGE_SHIFT - 1;
+ mask = GENMASK_ULL(h, 0);
+
+ if (aia->nr_group_bits) {
+ h = aia->nr_group_bits + aia->nr_group_shift - 1;
+ l = aia->nr_group_shift;
+ mask |= GENMASK_ULL(h, l);
+ }
+
+ return (addr & ~mask) >> IMSIC_MMIO_PAGE_SHIFT;
+}
+
+static u32 aia_imsic_hart_index(struct kvm_aia *aia, gpa_t addr)
+{
+ u32 hart, group = 0;
+
+ hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) &
+ GENMASK_ULL(aia->nr_hart_bits - 1, 0);
+ if (aia->nr_group_bits)
+ group = (addr >> aia->nr_group_shift) &
+ GENMASK_ULL(aia->nr_group_bits - 1, 0);
+
+ return (group << aia->nr_hart_bits) | hart;
+}
+
+static int aia_init(struct kvm *kvm)
+{
+ int ret, i;
+ unsigned long idx;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_aia *vaia;
+ struct kvm_aia *aia = &kvm->arch.aia;
+ gpa_t base_ppn = KVM_RISCV_AIA_UNDEF_ADDR;
+
+ /* Irqchip can be initialized only once */
+ if (kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* We might be in the middle of creating a VCPU? */
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
+ return -EBUSY;
+
+ /* Number of sources should be less than or equals number of IDs */
+ if (aia->nr_ids < aia->nr_sources)
+ return -EINVAL;
+
+ /* APLIC base is required for non-zero number of sources */
+ if (aia->nr_sources && aia->aplic_addr == KVM_RISCV_AIA_UNDEF_ADDR)
+ return -EINVAL;
+
+ /* Initialize APLIC */
+ ret = kvm_riscv_aia_aplic_init(kvm);
+ if (ret)
+ return ret;
+
+ /* Iterate over each VCPU */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ vaia = &vcpu->arch.aia_context;
+
+ /* IMSIC base is required */
+ if (vaia->imsic_addr == KVM_RISCV_AIA_UNDEF_ADDR) {
+ ret = -EINVAL;
+ goto fail_cleanup_imsics;
+ }
+
+ /* All IMSICs should have matching base PPN */
+ if (base_ppn == KVM_RISCV_AIA_UNDEF_ADDR)
+ base_ppn = aia_imsic_ppn(aia, vaia->imsic_addr);
+ if (base_ppn != aia_imsic_ppn(aia, vaia->imsic_addr)) {
+ ret = -EINVAL;
+ goto fail_cleanup_imsics;
+ }
+
+ /* Update HART index of the IMSIC based on IMSIC base */
+ vaia->hart_index = aia_imsic_hart_index(aia,
+ vaia->imsic_addr);
+
+ /* Initialize IMSIC for this VCPU */
+ ret = kvm_riscv_vcpu_aia_imsic_init(vcpu);
+ if (ret)
+ goto fail_cleanup_imsics;
+ }
+
+ /* Set the initialized flag */
+ kvm->arch.aia.initialized = true;
+
+ return 0;
+
+fail_cleanup_imsics:
+ for (i = idx - 1; i >= 0; i--) {
+ vcpu = kvm_get_vcpu(kvm, i);
+ if (!vcpu)
+ continue;
+ kvm_riscv_vcpu_aia_imsic_cleanup(vcpu);
+ }
+ kvm_riscv_aia_aplic_cleanup(kvm);
+ return ret;
+}
+
+static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ u32 nr;
+ u64 addr;
+ int nr_vcpus, r = -ENXIO;
+ unsigned long v, type = (unsigned long)attr->attr;
+ void __user *uaddr = (void __user *)(long)attr->addr;
+
+ switch (attr->group) {
+ case KVM_DEV_RISCV_AIA_GRP_CONFIG:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = aia_config(dev->kvm, type, &nr, true);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+
+ case KVM_DEV_RISCV_AIA_GRP_ADDR:
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ nr_vcpus = atomic_read(&dev->kvm->online_vcpus);
+ mutex_lock(&dev->kvm->lock);
+ if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC)
+ r = aia_aplic_addr(dev->kvm, &addr, true);
+ else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus))
+ r = aia_imsic_addr(dev->kvm, &addr,
+ type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), true);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+
+ case KVM_DEV_RISCV_AIA_GRP_CTRL:
+ switch (type) {
+ case KVM_DEV_RISCV_AIA_CTRL_INIT:
+ mutex_lock(&dev->kvm->lock);
+ r = aia_init(dev->kvm);
+ mutex_unlock(&dev->kvm->lock);
+ break;
+ }
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_APLIC:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_aplic_set_attr(dev->kvm, type, nr);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_IMSIC:
+ if (copy_from_user(&v, uaddr, sizeof(v)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, true, &v);
+ mutex_unlock(&dev->kvm->lock);
+
+ break;
+ }
+
+ return r;
+}
+
+static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ u32 nr;
+ u64 addr;
+ int nr_vcpus, r = -ENXIO;
+ void __user *uaddr = (void __user *)(long)attr->addr;
+ unsigned long v, type = (unsigned long)attr->attr;
+
+ switch (attr->group) {
+ case KVM_DEV_RISCV_AIA_GRP_CONFIG:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = aia_config(dev->kvm, type, &nr, false);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &nr, sizeof(nr)))
+ return -EFAULT;
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_ADDR:
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ nr_vcpus = atomic_read(&dev->kvm->online_vcpus);
+ mutex_lock(&dev->kvm->lock);
+ if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC)
+ r = aia_aplic_addr(dev->kvm, &addr, false);
+ else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus))
+ r = aia_imsic_addr(dev->kvm, &addr,
+ type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), false);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &addr, sizeof(addr)))
+ return -EFAULT;
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_APLIC:
+ if (copy_from_user(&nr, uaddr, sizeof(nr)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_aplic_get_attr(dev->kvm, type, &nr);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &nr, sizeof(nr)))
+ return -EFAULT;
+
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_IMSIC:
+ if (copy_from_user(&v, uaddr, sizeof(v)))
+ return -EFAULT;
+
+ mutex_lock(&dev->kvm->lock);
+ r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, false, &v);
+ mutex_unlock(&dev->kvm->lock);
+ if (r)
+ return r;
+
+ if (copy_to_user(uaddr, &v, sizeof(v)))
+ return -EFAULT;
+
+ break;
+ }
+
+ return r;
+}
+
+static int aia_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ int nr_vcpus;
+
+ switch (attr->group) {
+ case KVM_DEV_RISCV_AIA_GRP_CONFIG:
+ switch (attr->attr) {
+ case KVM_DEV_RISCV_AIA_CONFIG_MODE:
+ case KVM_DEV_RISCV_AIA_CONFIG_IDS:
+ case KVM_DEV_RISCV_AIA_CONFIG_SRCS:
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS:
+ case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT:
+ case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS:
+ case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS:
+ return 0;
+ }
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_ADDR:
+ nr_vcpus = atomic_read(&dev->kvm->online_vcpus);
+ if (attr->attr == KVM_DEV_RISCV_AIA_ADDR_APLIC)
+ return 0;
+ else if (attr->attr < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus))
+ return 0;
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_RISCV_AIA_CTRL_INIT:
+ return 0;
+ }
+ break;
+ case KVM_DEV_RISCV_AIA_GRP_APLIC:
+ return kvm_riscv_aia_aplic_has_attr(dev->kvm, attr->attr);
+ case KVM_DEV_RISCV_AIA_GRP_IMSIC:
+ return kvm_riscv_aia_imsic_has_attr(dev->kvm, attr->attr);
+ }
+
+ return -ENXIO;
+}
+
+struct kvm_device_ops kvm_riscv_aia_device_ops = {
+ .name = "kvm-riscv-aia",
+ .create = aia_create,
+ .destroy = aia_destroy,
+ .set_attr = aia_set_attr,
+ .get_attr = aia_get_attr,
+ .has_attr = aia_has_attr,
+};
+
+int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(vcpu->kvm))
+ return 1;
+
+ /* Update the IMSIC HW state before entering guest mode */
+ return kvm_riscv_vcpu_aia_imsic_update(vcpu);
+}
+
+void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr;
+ struct kvm_vcpu_aia_csr *reset_csr =
+ &vcpu->arch.aia_context.guest_reset_csr;
+
+ if (!kvm_riscv_aia_available())
+ return;
+ memcpy(csr, reset_csr, sizeof(*csr));
+
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(vcpu->kvm))
+ return;
+
+ /* Reset the IMSIC context */
+ kvm_riscv_vcpu_aia_imsic_reset(vcpu);
+}
+
+int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context;
+
+ if (!kvm_riscv_aia_available())
+ return 0;
+
+ /*
+ * We don't do any memory allocations over here because these
+ * will be done after AIA device is initialized by the user-space.
+ *
+ * Refer, aia_init() implementation for more details.
+ */
+
+ /* Initialize default values in AIA vcpu context */
+ vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR;
+ vaia->hart_index = vcpu->vcpu_idx;
+
+ return 0;
+}
+
+void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(vcpu->kvm))
+ return;
+
+ /* Cleanup IMSIC context */
+ kvm_riscv_vcpu_aia_imsic_cleanup(vcpu);
+}
+
+int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index,
+ u32 guest_index, u32 iid)
+{
+ unsigned long idx;
+ struct kvm_vcpu *vcpu;
+
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* Inject MSI to matching VCPU */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ if (vcpu->arch.aia_context.hart_index == hart_index)
+ return kvm_riscv_vcpu_aia_imsic_inject(vcpu,
+ guest_index,
+ 0, iid);
+ }
+
+ return 0;
+}
+
+int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ gpa_t tppn, ippn;
+ unsigned long idx;
+ struct kvm_vcpu *vcpu;
+ u32 g, toff, iid = msi->data;
+ struct kvm_aia *aia = &kvm->arch.aia;
+ gpa_t target = (((gpa_t)msi->address_hi) << 32) | msi->address_lo;
+
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* Convert target address to target PPN */
+ tppn = target >> IMSIC_MMIO_PAGE_SHIFT;
+
+ /* Extract and clear Guest ID from target PPN */
+ g = tppn & (BIT(aia->nr_guest_bits) - 1);
+ tppn &= ~((gpa_t)(BIT(aia->nr_guest_bits) - 1));
+
+ /* Inject MSI to matching VCPU */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ ippn = vcpu->arch.aia_context.imsic_addr >>
+ IMSIC_MMIO_PAGE_SHIFT;
+ if (ippn == tppn) {
+ toff = target & (IMSIC_MMIO_PAGE_SZ - 1);
+ return kvm_riscv_vcpu_aia_imsic_inject(vcpu, g,
+ toff, iid);
+ }
+ }
+
+ return 0;
+}
+
+int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -EBUSY;
+
+ /* Inject interrupt level change in APLIC */
+ return kvm_riscv_aia_aplic_inject(kvm, irq, level);
+}
+
+void kvm_riscv_aia_init_vm(struct kvm *kvm)
+{
+ struct kvm_aia *aia = &kvm->arch.aia;
+
+ if (!kvm_riscv_aia_available())
+ return;
+
+ /*
+ * We don't do any memory allocations over here because these
+ * will be done after AIA device is initialized by the user-space.
+ *
+ * Refer, aia_init() implementation for more details.
+ */
+
+ /* Initialize default values in AIA global context */
+ aia->mode = (kvm_riscv_aia_nr_hgei) ?
+ KVM_DEV_RISCV_AIA_MODE_AUTO : KVM_DEV_RISCV_AIA_MODE_EMUL;
+ aia->nr_ids = kvm_riscv_aia_max_ids - 1;
+ aia->nr_sources = 0;
+ aia->nr_group_bits = 0;
+ aia->nr_group_shift = KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN;
+ aia->nr_hart_bits = 0;
+ aia->nr_guest_bits = 0;
+ aia->aplic_addr = KVM_RISCV_AIA_UNDEF_ADDR;
+}
+
+void kvm_riscv_aia_destroy_vm(struct kvm *kvm)
+{
+ /* Proceed only if AIA was initialized successfully */
+ if (!kvm_riscv_aia_initialized(kvm))
+ return;
+
+ /* Cleanup APLIC context */
+ kvm_riscv_aia_aplic_cleanup(kvm);
+}
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
new file mode 100644
index 000000000000..6cf23b8adb71
--- /dev/null
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -0,0 +1,1084 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ *
+ * Authors:
+ * Anup Patel <apatel@ventanamicro.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/kvm_host.h>
+#include <linux/math.h>
+#include <linux/spinlock.h>
+#include <linux/swab.h>
+#include <kvm/iodev.h>
+#include <asm/csr.h>
+#include <asm/kvm_aia_imsic.h>
+
+#define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64))
+
+struct imsic_mrif_eix {
+ unsigned long eip[BITS_PER_TYPE(u64) / BITS_PER_LONG];
+ unsigned long eie[BITS_PER_TYPE(u64) / BITS_PER_LONG];
+};
+
+struct imsic_mrif {
+ struct imsic_mrif_eix eix[IMSIC_MAX_EIX];
+ unsigned long eithreshold;
+ unsigned long eidelivery;
+};
+
+struct imsic {
+ struct kvm_io_device iodev;
+
+ u32 nr_msis;
+ u32 nr_eix;
+ u32 nr_hw_eix;
+
+ /*
+ * At any point in time, the register state is in
+ * one of the following places:
+ *
+ * 1) Hardware: IMSIC VS-file (vsfile_cpu >= 0)
+ * 2) Software: IMSIC SW-file (vsfile_cpu < 0)
+ */
+
+ /* IMSIC VS-file */
+ rwlock_t vsfile_lock;
+ int vsfile_cpu;
+ int vsfile_hgei;
+ void __iomem *vsfile_va;
+ phys_addr_t vsfile_pa;
+
+ /* IMSIC SW-file */
+ struct imsic_mrif *swfile;
+ phys_addr_t swfile_pa;
+};
+
+#define imsic_vs_csr_read(__c) \
+({ \
+ unsigned long __r; \
+ csr_write(CSR_VSISELECT, __c); \
+ __r = csr_read(CSR_VSIREG); \
+ __r; \
+})
+
+#define imsic_read_switchcase(__ireg) \
+ case __ireg: \
+ return imsic_vs_csr_read(__ireg);
+#define imsic_read_switchcase_2(__ireg) \
+ imsic_read_switchcase(__ireg + 0) \
+ imsic_read_switchcase(__ireg + 1)
+#define imsic_read_switchcase_4(__ireg) \
+ imsic_read_switchcase_2(__ireg + 0) \
+ imsic_read_switchcase_2(__ireg + 2)
+#define imsic_read_switchcase_8(__ireg) \
+ imsic_read_switchcase_4(__ireg + 0) \
+ imsic_read_switchcase_4(__ireg + 4)
+#define imsic_read_switchcase_16(__ireg) \
+ imsic_read_switchcase_8(__ireg + 0) \
+ imsic_read_switchcase_8(__ireg + 8)
+#define imsic_read_switchcase_32(__ireg) \
+ imsic_read_switchcase_16(__ireg + 0) \
+ imsic_read_switchcase_16(__ireg + 16)
+#define imsic_read_switchcase_64(__ireg) \
+ imsic_read_switchcase_32(__ireg + 0) \
+ imsic_read_switchcase_32(__ireg + 32)
+
+static unsigned long imsic_eix_read(int ireg)
+{
+ switch (ireg) {
+ imsic_read_switchcase_64(IMSIC_EIP0)
+ imsic_read_switchcase_64(IMSIC_EIE0)
+ }
+
+ return 0;
+}
+
+#define imsic_vs_csr_swap(__c, __v) \
+({ \
+ unsigned long __r; \
+ csr_write(CSR_VSISELECT, __c); \
+ __r = csr_swap(CSR_VSIREG, __v); \
+ __r; \
+})
+
+#define imsic_swap_switchcase(__ireg, __v) \
+ case __ireg: \
+ return imsic_vs_csr_swap(__ireg, __v);
+#define imsic_swap_switchcase_2(__ireg, __v) \
+ imsic_swap_switchcase(__ireg + 0, __v) \
+ imsic_swap_switchcase(__ireg + 1, __v)
+#define imsic_swap_switchcase_4(__ireg, __v) \
+ imsic_swap_switchcase_2(__ireg + 0, __v) \
+ imsic_swap_switchcase_2(__ireg + 2, __v)
+#define imsic_swap_switchcase_8(__ireg, __v) \
+ imsic_swap_switchcase_4(__ireg + 0, __v) \
+ imsic_swap_switchcase_4(__ireg + 4, __v)
+#define imsic_swap_switchcase_16(__ireg, __v) \
+ imsic_swap_switchcase_8(__ireg + 0, __v) \
+ imsic_swap_switchcase_8(__ireg + 8, __v)
+#define imsic_swap_switchcase_32(__ireg, __v) \
+ imsic_swap_switchcase_16(__ireg + 0, __v) \
+ imsic_swap_switchcase_16(__ireg + 16, __v)
+#define imsic_swap_switchcase_64(__ireg, __v) \
+ imsic_swap_switchcase_32(__ireg + 0, __v) \
+ imsic_swap_switchcase_32(__ireg + 32, __v)
+
+static unsigned long imsic_eix_swap(int ireg, unsigned long val)
+{
+ switch (ireg) {
+ imsic_swap_switchcase_64(IMSIC_EIP0, val)
+ imsic_swap_switchcase_64(IMSIC_EIE0, val)
+ }
+
+ return 0;
+}
+
+#define imsic_vs_csr_write(__c, __v) \
+do { \
+ csr_write(CSR_VSISELECT, __c); \
+ csr_write(CSR_VSIREG, __v); \
+} while (0)
+
+#define imsic_write_switchcase(__ireg, __v) \
+ case __ireg: \
+ imsic_vs_csr_write(__ireg, __v); \
+ break;
+#define imsic_write_switchcase_2(__ireg, __v) \
+ imsic_write_switchcase(__ireg + 0, __v) \
+ imsic_write_switchcase(__ireg + 1, __v)
+#define imsic_write_switchcase_4(__ireg, __v) \
+ imsic_write_switchcase_2(__ireg + 0, __v) \
+ imsic_write_switchcase_2(__ireg + 2, __v)
+#define imsic_write_switchcase_8(__ireg, __v) \
+ imsic_write_switchcase_4(__ireg + 0, __v) \
+ imsic_write_switchcase_4(__ireg + 4, __v)
+#define imsic_write_switchcase_16(__ireg, __v) \
+ imsic_write_switchcase_8(__ireg + 0, __v) \
+ imsic_write_switchcase_8(__ireg + 8, __v)
+#define imsic_write_switchcase_32(__ireg, __v) \
+ imsic_write_switchcase_16(__ireg + 0, __v) \
+ imsic_write_switchcase_16(__ireg + 16, __v)
+#define imsic_write_switchcase_64(__ireg, __v) \
+ imsic_write_switchcase_32(__ireg + 0, __v) \
+ imsic_write_switchcase_32(__ireg + 32, __v)
+
+static void imsic_eix_write(int ireg, unsigned long val)
+{
+ switch (ireg) {
+ imsic_write_switchcase_64(IMSIC_EIP0, val)
+ imsic_write_switchcase_64(IMSIC_EIE0, val)
+ }
+}
+
+#define imsic_vs_csr_set(__c, __v) \
+do { \
+ csr_write(CSR_VSISELECT, __c); \
+ csr_set(CSR_VSIREG, __v); \
+} while (0)
+
+#define imsic_set_switchcase(__ireg, __v) \
+ case __ireg: \
+ imsic_vs_csr_set(__ireg, __v); \
+ break;
+#define imsic_set_switchcase_2(__ireg, __v) \
+ imsic_set_switchcase(__ireg + 0, __v) \
+ imsic_set_switchcase(__ireg + 1, __v)
+#define imsic_set_switchcase_4(__ireg, __v) \
+ imsic_set_switchcase_2(__ireg + 0, __v) \
+ imsic_set_switchcase_2(__ireg + 2, __v)
+#define imsic_set_switchcase_8(__ireg, __v) \
+ imsic_set_switchcase_4(__ireg + 0, __v) \
+ imsic_set_switchcase_4(__ireg + 4, __v)
+#define imsic_set_switchcase_16(__ireg, __v) \
+ imsic_set_switchcase_8(__ireg + 0, __v) \
+ imsic_set_switchcase_8(__ireg + 8, __v)
+#define imsic_set_switchcase_32(__ireg, __v) \
+ imsic_set_switchcase_16(__ireg + 0, __v) \
+ imsic_set_switchcase_16(__ireg + 16, __v)
+#define imsic_set_switchcase_64(__ireg, __v) \
+ imsic_set_switchcase_32(__ireg + 0, __v) \
+ imsic_set_switchcase_32(__ireg + 32, __v)
+
+static void imsic_eix_set(int ireg, unsigned long val)
+{
+ switch (ireg) {
+ imsic_set_switchcase_64(IMSIC_EIP0, val)
+ imsic_set_switchcase_64(IMSIC_EIE0, val)
+ }
+}
+
+static unsigned long imsic_mrif_atomic_rmw(struct imsic_mrif *mrif,
+ unsigned long *ptr,
+ unsigned long new_val,
+ unsigned long wr_mask)
+{
+ unsigned long old_val = 0, tmp = 0;
+
+ __asm__ __volatile__ (
+ "0: lr.w.aq %1, %0\n"
+ " and %2, %1, %3\n"
+ " or %2, %2, %4\n"
+ " sc.w.rl %2, %2, %0\n"
+ " bnez %2, 0b"
+ : "+A" (*ptr), "+r" (old_val), "+r" (tmp)
+ : "r" (~wr_mask), "r" (new_val & wr_mask)
+ : "memory");
+
+ return old_val;
+}
+
+static unsigned long imsic_mrif_atomic_or(struct imsic_mrif *mrif,
+ unsigned long *ptr,
+ unsigned long val)
+{
+ return atomic_long_fetch_or(val, (atomic_long_t *)ptr);
+}
+
+#define imsic_mrif_atomic_write(__mrif, __ptr, __new_val) \
+ imsic_mrif_atomic_rmw(__mrif, __ptr, __new_val, -1UL)
+#define imsic_mrif_atomic_read(__mrif, __ptr) \
+ imsic_mrif_atomic_or(__mrif, __ptr, 0)
+
+static u32 imsic_mrif_topei(struct imsic_mrif *mrif, u32 nr_eix, u32 nr_msis)
+{
+ struct imsic_mrif_eix *eix;
+ u32 i, imin, imax, ei, max_msi;
+ unsigned long eipend[BITS_PER_TYPE(u64) / BITS_PER_LONG];
+ unsigned long eithreshold = imsic_mrif_atomic_read(mrif,
+ &mrif->eithreshold);
+
+ max_msi = (eithreshold && (eithreshold <= nr_msis)) ?
+ eithreshold : nr_msis;
+ for (ei = 0; ei < nr_eix; ei++) {
+ eix = &mrif->eix[ei];
+ eipend[0] = imsic_mrif_atomic_read(mrif, &eix->eie[0]) &
+ imsic_mrif_atomic_read(mrif, &eix->eip[0]);
+#ifdef CONFIG_32BIT
+ eipend[1] = imsic_mrif_atomic_read(mrif, &eix->eie[1]) &
+ imsic_mrif_atomic_read(mrif, &eix->eip[1]);
+ if (!eipend[0] && !eipend[1])
+#else
+ if (!eipend[0])
+#endif
+ continue;
+
+ imin = ei * BITS_PER_TYPE(u64);
+ imax = ((imin + BITS_PER_TYPE(u64)) < max_msi) ?
+ imin + BITS_PER_TYPE(u64) : max_msi;
+ for (i = (!imin) ? 1 : imin; i < imax; i++) {
+ if (test_bit(i - imin, eipend))
+ return (i << TOPEI_ID_SHIFT) | i;
+ }
+ }
+
+ return 0;
+}
+
+static int imsic_mrif_isel_check(u32 nr_eix, unsigned long isel)
+{
+ u32 num = 0;
+
+ switch (isel) {
+ case IMSIC_EIDELIVERY:
+ case IMSIC_EITHRESHOLD:
+ break;
+ case IMSIC_EIP0 ... IMSIC_EIP63:
+ num = isel - IMSIC_EIP0;
+ break;
+ case IMSIC_EIE0 ... IMSIC_EIE63:
+ num = isel - IMSIC_EIE0;
+ break;
+ default:
+ return -ENOENT;
+ }
+#ifndef CONFIG_32BIT
+ if (num & 0x1)
+ return -EINVAL;
+#endif
+ if ((num / 2) >= nr_eix)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int imsic_mrif_rmw(struct imsic_mrif *mrif, u32 nr_eix,
+ unsigned long isel, unsigned long *val,
+ unsigned long new_val, unsigned long wr_mask)
+{
+ bool pend;
+ struct imsic_mrif_eix *eix;
+ unsigned long *ei, num, old_val = 0;
+
+ switch (isel) {
+ case IMSIC_EIDELIVERY:
+ old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eidelivery,
+ new_val, wr_mask & 0x1);
+ break;
+ case IMSIC_EITHRESHOLD:
+ old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eithreshold,
+ new_val, wr_mask & (IMSIC_MAX_ID - 1));
+ break;
+ case IMSIC_EIP0 ... IMSIC_EIP63:
+ case IMSIC_EIE0 ... IMSIC_EIE63:
+ if (isel >= IMSIC_EIP0 && isel <= IMSIC_EIP63) {
+ pend = true;
+ num = isel - IMSIC_EIP0;
+ } else {
+ pend = false;
+ num = isel - IMSIC_EIE0;
+ }
+
+ if ((num / 2) >= nr_eix)
+ return -EINVAL;
+ eix = &mrif->eix[num / 2];
+
+#ifndef CONFIG_32BIT
+ if (num & 0x1)
+ return -EINVAL;
+ ei = (pend) ? &eix->eip[0] : &eix->eie[0];
+#else
+ ei = (pend) ? &eix->eip[num & 0x1] : &eix->eie[num & 0x1];
+#endif
+
+ /* Bit0 of EIP0 or EIE0 is read-only */
+ if (!num)
+ wr_mask &= ~BIT(0);
+
+ old_val = imsic_mrif_atomic_rmw(mrif, ei, new_val, wr_mask);
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ if (val)
+ *val = old_val;
+
+ return 0;
+}
+
+struct imsic_vsfile_read_data {
+ int hgei;
+ u32 nr_eix;
+ bool clear;
+ struct imsic_mrif *mrif;
+};
+
+static void imsic_vsfile_local_read(void *data)
+{
+ u32 i;
+ struct imsic_mrif_eix *eix;
+ struct imsic_vsfile_read_data *idata = data;
+ struct imsic_mrif *mrif = idata->mrif;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to store
+ * values in MRIF because imsic_vsfile_read() is always called
+ * with pointer to temporary MRIF on stack.
+ */
+
+ if (idata->clear) {
+ mrif->eidelivery = imsic_vs_csr_swap(IMSIC_EIDELIVERY, 0);
+ mrif->eithreshold = imsic_vs_csr_swap(IMSIC_EITHRESHOLD, 0);
+ for (i = 0; i < idata->nr_eix; i++) {
+ eix = &mrif->eix[i];
+ eix->eip[0] = imsic_eix_swap(IMSIC_EIP0 + i * 2, 0);
+ eix->eie[0] = imsic_eix_swap(IMSIC_EIE0 + i * 2, 0);
+#ifdef CONFIG_32BIT
+ eix->eip[1] = imsic_eix_swap(IMSIC_EIP0 + i * 2 + 1, 0);
+ eix->eie[1] = imsic_eix_swap(IMSIC_EIE0 + i * 2 + 1, 0);
+#endif
+ }
+ } else {
+ mrif->eidelivery = imsic_vs_csr_read(IMSIC_EIDELIVERY);
+ mrif->eithreshold = imsic_vs_csr_read(IMSIC_EITHRESHOLD);
+ for (i = 0; i < idata->nr_eix; i++) {
+ eix = &mrif->eix[i];
+ eix->eip[0] = imsic_eix_read(IMSIC_EIP0 + i * 2);
+ eix->eie[0] = imsic_eix_read(IMSIC_EIE0 + i * 2);
+#ifdef CONFIG_32BIT
+ eix->eip[1] = imsic_eix_read(IMSIC_EIP0 + i * 2 + 1);
+ eix->eie[1] = imsic_eix_read(IMSIC_EIE0 + i * 2 + 1);
+#endif
+ }
+ }
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static void imsic_vsfile_read(int vsfile_hgei, int vsfile_cpu, u32 nr_eix,
+ bool clear, struct imsic_mrif *mrif)
+{
+ struct imsic_vsfile_read_data idata;
+
+ /* We can only read clear if we have a IMSIC VS-file */
+ if (vsfile_cpu < 0 || vsfile_hgei <= 0)
+ return;
+
+ /* We can only read clear on local CPU */
+ idata.hgei = vsfile_hgei;
+ idata.nr_eix = nr_eix;
+ idata.clear = clear;
+ idata.mrif = mrif;
+ on_each_cpu_mask(cpumask_of(vsfile_cpu),
+ imsic_vsfile_local_read, &idata, 1);
+}
+
+struct imsic_vsfile_rw_data {
+ int hgei;
+ int isel;
+ bool write;
+ unsigned long val;
+};
+
+static void imsic_vsfile_local_rw(void *data)
+{
+ struct imsic_vsfile_rw_data *idata = data;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ switch (idata->isel) {
+ case IMSIC_EIDELIVERY:
+ if (idata->write)
+ imsic_vs_csr_write(IMSIC_EIDELIVERY, idata->val);
+ else
+ idata->val = imsic_vs_csr_read(IMSIC_EIDELIVERY);
+ break;
+ case IMSIC_EITHRESHOLD:
+ if (idata->write)
+ imsic_vs_csr_write(IMSIC_EITHRESHOLD, idata->val);
+ else
+ idata->val = imsic_vs_csr_read(IMSIC_EITHRESHOLD);
+ break;
+ case IMSIC_EIP0 ... IMSIC_EIP63:
+ case IMSIC_EIE0 ... IMSIC_EIE63:
+#ifndef CONFIG_32BIT
+ if (idata->isel & 0x1)
+ break;
+#endif
+ if (idata->write)
+ imsic_eix_write(idata->isel, idata->val);
+ else
+ idata->val = imsic_eix_read(idata->isel);
+ break;
+ default:
+ break;
+ }
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static int imsic_vsfile_rw(int vsfile_hgei, int vsfile_cpu, u32 nr_eix,
+ unsigned long isel, bool write,
+ unsigned long *val)
+{
+ int rc;
+ struct imsic_vsfile_rw_data rdata;
+
+ /* We can only access register if we have a IMSIC VS-file */
+ if (vsfile_cpu < 0 || vsfile_hgei <= 0)
+ return -EINVAL;
+
+ /* Check IMSIC register iselect */
+ rc = imsic_mrif_isel_check(nr_eix, isel);
+ if (rc)
+ return rc;
+
+ /* We can only access register on local CPU */
+ rdata.hgei = vsfile_hgei;
+ rdata.isel = isel;
+ rdata.write = write;
+ rdata.val = (write) ? *val : 0;
+ on_each_cpu_mask(cpumask_of(vsfile_cpu),
+ imsic_vsfile_local_rw, &rdata, 1);
+
+ if (!write)
+ *val = rdata.val;
+
+ return 0;
+}
+
+static void imsic_vsfile_local_clear(int vsfile_hgei, u32 nr_eix)
+{
+ u32 i;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ /* We can only zero-out if we have a IMSIC VS-file */
+ if (vsfile_hgei <= 0)
+ return;
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ imsic_vs_csr_write(IMSIC_EIDELIVERY, 0);
+ imsic_vs_csr_write(IMSIC_EITHRESHOLD, 0);
+ for (i = 0; i < nr_eix; i++) {
+ imsic_eix_write(IMSIC_EIP0 + i * 2, 0);
+ imsic_eix_write(IMSIC_EIE0 + i * 2, 0);
+#ifdef CONFIG_32BIT
+ imsic_eix_write(IMSIC_EIP0 + i * 2 + 1, 0);
+ imsic_eix_write(IMSIC_EIE0 + i * 2 + 1, 0);
+#endif
+ }
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static void imsic_vsfile_local_update(int vsfile_hgei, u32 nr_eix,
+ struct imsic_mrif *mrif)
+{
+ u32 i;
+ struct imsic_mrif_eix *eix;
+ unsigned long new_hstatus, old_hstatus, old_vsiselect;
+
+ /* We can only update if we have a HW IMSIC context */
+ if (vsfile_hgei <= 0)
+ return;
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to read values
+ * from MRIF in this function because it is always called with
+ * pointer to temporary MRIF on stack.
+ */
+
+ old_vsiselect = csr_read(CSR_VSISELECT);
+ old_hstatus = csr_read(CSR_HSTATUS);
+ new_hstatus = old_hstatus & ~HSTATUS_VGEIN;
+ new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT;
+ csr_write(CSR_HSTATUS, new_hstatus);
+
+ for (i = 0; i < nr_eix; i++) {
+ eix = &mrif->eix[i];
+ imsic_eix_set(IMSIC_EIP0 + i * 2, eix->eip[0]);
+ imsic_eix_set(IMSIC_EIE0 + i * 2, eix->eie[0]);
+#ifdef CONFIG_32BIT
+ imsic_eix_set(IMSIC_EIP0 + i * 2 + 1, eix->eip[1]);
+ imsic_eix_set(IMSIC_EIE0 + i * 2 + 1, eix->eie[1]);
+#endif
+ }
+ imsic_vs_csr_write(IMSIC_EITHRESHOLD, mrif->eithreshold);
+ imsic_vs_csr_write(IMSIC_EIDELIVERY, mrif->eidelivery);
+
+ csr_write(CSR_HSTATUS, old_hstatus);
+ csr_write(CSR_VSISELECT, old_vsiselect);
+}
+
+static void imsic_vsfile_cleanup(struct imsic *imsic)
+{
+ int old_vsfile_hgei, old_vsfile_cpu;
+ unsigned long flags;
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to clear the
+ * SW-file in this function because it is always called when the
+ * VCPU is being destroyed.
+ */
+
+ write_lock_irqsave(&imsic->vsfile_lock, flags);
+ old_vsfile_hgei = imsic->vsfile_hgei;
+ old_vsfile_cpu = imsic->vsfile_cpu;
+ imsic->vsfile_cpu = imsic->vsfile_hgei = -1;
+ imsic->vsfile_va = NULL;
+ imsic->vsfile_pa = 0;
+ write_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ memset(imsic->swfile, 0, sizeof(*imsic->swfile));
+
+ if (old_vsfile_cpu >= 0)
+ kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei);
+}
+
+static void imsic_swfile_extirq_update(struct kvm_vcpu *vcpu)
+{
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+ struct imsic_mrif *mrif = imsic->swfile;
+
+ if (imsic_mrif_atomic_read(mrif, &mrif->eidelivery) &&
+ imsic_mrif_topei(mrif, imsic->nr_eix, imsic->nr_msis))
+ kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_EXT);
+ else
+ kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT);
+}
+
+static void imsic_swfile_read(struct kvm_vcpu *vcpu, bool clear,
+ struct imsic_mrif *mrif)
+{
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ /*
+ * We don't use imsic_mrif_atomic_xyz() functions to read and
+ * write SW-file and MRIF in this function because it is always
+ * called when VCPU is not using SW-file and the MRIF points to
+ * a temporary MRIF on stack.
+ */
+
+ memcpy(mrif, imsic->swfile, sizeof(*mrif));
+ if (clear) {
+ memset(imsic->swfile, 0, sizeof(*imsic->swfile));
+ kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT);
+ }
+}
+
+static void imsic_swfile_update(struct kvm_vcpu *vcpu,
+ struct imsic_mrif *mrif)
+{
+ u32 i;
+ struct imsic_mrif_eix *seix, *eix;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+ struct imsic_mrif *smrif = imsic->swfile;
+
+ imsic_mrif_atomic_write(smrif, &smrif->eidelivery, mrif->eidelivery);
+ imsic_mrif_atomic_write(smrif, &smrif->eithreshold, mrif->eithreshold);
+ for (i = 0; i < imsic->nr_eix; i++) {
+ seix = &smrif->eix[i];
+ eix = &mrif->eix[i];
+ imsic_mrif_atomic_or(smrif, &seix->eip[0], eix->eip[0]);
+ imsic_mrif_atomic_or(smrif, &seix->eie[0], eix->eie[0]);
+#ifdef CONFIG_32BIT
+ imsic_mrif_atomic_or(smrif, &seix->eip[1], eix->eip[1]);
+ imsic_mrif_atomic_or(smrif, &seix->eie[1], eix->eie[1]);
+#endif
+ }
+
+ imsic_swfile_extirq_update(vcpu);
+}
+
+void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct imsic_mrif tmrif;
+ int old_vsfile_hgei, old_vsfile_cpu;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ /* Read and clear IMSIC VS-file details */
+ write_lock_irqsave(&imsic->vsfile_lock, flags);
+ old_vsfile_hgei = imsic->vsfile_hgei;
+ old_vsfile_cpu = imsic->vsfile_cpu;
+ imsic->vsfile_cpu = imsic->vsfile_hgei = -1;
+ imsic->vsfile_va = NULL;
+ imsic->vsfile_pa = 0;
+ write_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ /* Do nothing, if no IMSIC VS-file to release */
+ if (old_vsfile_cpu < 0)
+ return;
+
+ /*
+ * At this point, all interrupt producers are still using
+ * the old IMSIC VS-file so we first re-direct all interrupt
+ * producers.
+ */
+
+ /* Purge the G-stage mapping */
+ kvm_riscv_gstage_iounmap(vcpu->kvm,
+ vcpu->arch.aia_context.imsic_addr,
+ IMSIC_MMIO_PAGE_SZ);
+
+ /* TODO: Purge the IOMMU mapping ??? */
+
+ /*
+ * At this point, all interrupt producers have been re-directed
+ * to somewhere else so we move register state from the old IMSIC
+ * VS-file to the IMSIC SW-file.
+ */
+
+ /* Read and clear register state from old IMSIC VS-file */
+ memset(&tmrif, 0, sizeof(tmrif));
+ imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu, imsic->nr_hw_eix,
+ true, &tmrif);
+
+ /* Update register state in IMSIC SW-file */
+ imsic_swfile_update(vcpu, &tmrif);
+
+ /* Free-up old IMSIC VS-file */
+ kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei);
+}
+
+int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ phys_addr_t new_vsfile_pa;
+ struct imsic_mrif tmrif;
+ void __iomem *new_vsfile_va;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_run *run = vcpu->run;
+ struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context;
+ struct imsic *imsic = vaia->imsic_state;
+ int ret = 0, new_vsfile_hgei = -1, old_vsfile_hgei, old_vsfile_cpu;
+
+ /* Do nothing for emulation mode */
+ if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_EMUL)
+ return 1;
+
+ /* Read old IMSIC VS-file details */
+ read_lock_irqsave(&imsic->vsfile_lock, flags);
+ old_vsfile_hgei = imsic->vsfile_hgei;
+ old_vsfile_cpu = imsic->vsfile_cpu;
+ read_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ /* Do nothing if we are continuing on same CPU */
+ if (old_vsfile_cpu == vcpu->cpu)
+ return 1;
+
+ /* Allocate new IMSIC VS-file */
+ ret = kvm_riscv_aia_alloc_hgei(vcpu->cpu, vcpu,
+ &new_vsfile_va, &new_vsfile_pa);
+ if (ret <= 0) {
+ /* For HW acceleration mode, we can't continue */
+ if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_HWACCEL) {
+ run->fail_entry.hardware_entry_failure_reason =
+ CSR_HSTATUS;
+ run->fail_entry.cpu = vcpu->cpu;
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ return 0;
+ }
+
+ /* Release old IMSIC VS-file */
+ if (old_vsfile_cpu >= 0)
+ kvm_riscv_vcpu_aia_imsic_release(vcpu);
+
+ /* For automatic mode, we continue */
+ goto done;
+ }
+ new_vsfile_hgei = ret;
+
+ /*
+ * At this point, all interrupt producers are still using
+ * to the old IMSIC VS-file so we first move all interrupt
+ * producers to the new IMSIC VS-file.
+ */
+
+ /* Zero-out new IMSIC VS-file */
+ imsic_vsfile_local_clear(new_vsfile_hgei, imsic->nr_hw_eix);
+
+ /* Update G-stage mapping for the new IMSIC VS-file */
+ ret = kvm_riscv_gstage_ioremap(kvm, vcpu->arch.aia_context.imsic_addr,
+ new_vsfile_pa, IMSIC_MMIO_PAGE_SZ,
+ true, true);
+ if (ret)
+ goto fail_free_vsfile_hgei;
+
+ /* TODO: Update the IOMMU mapping ??? */
+
+ /* Update new IMSIC VS-file details in IMSIC context */
+ write_lock_irqsave(&imsic->vsfile_lock, flags);
+ imsic->vsfile_hgei = new_vsfile_hgei;
+ imsic->vsfile_cpu = vcpu->cpu;
+ imsic->vsfile_va = new_vsfile_va;
+ imsic->vsfile_pa = new_vsfile_pa;
+ write_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ /*
+ * At this point, all interrupt producers have been moved
+ * to the new IMSIC VS-file so we move register state from
+ * the old IMSIC VS/SW-file to the new IMSIC VS-file.
+ */
+
+ memset(&tmrif, 0, sizeof(tmrif));
+ if (old_vsfile_cpu >= 0) {
+ /* Read and clear register state from old IMSIC VS-file */
+ imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu,
+ imsic->nr_hw_eix, true, &tmrif);
+
+ /* Free-up old IMSIC VS-file */
+ kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei);
+ } else {
+ /* Read and clear register state from IMSIC SW-file */
+ imsic_swfile_read(vcpu, true, &tmrif);
+ }
+
+ /* Restore register state in the new IMSIC VS-file */
+ imsic_vsfile_local_update(new_vsfile_hgei, imsic->nr_hw_eix, &tmrif);
+
+done:
+ /* Set VCPU HSTATUS.VGEIN to new IMSIC VS-file */
+ vcpu->arch.guest_context.hstatus &= ~HSTATUS_VGEIN;
+ if (new_vsfile_hgei > 0)
+ vcpu->arch.guest_context.hstatus |=
+ ((unsigned long)new_vsfile_hgei) << HSTATUS_VGEIN_SHIFT;
+
+ /* Continue run-loop */
+ return 1;
+
+fail_free_vsfile_hgei:
+ kvm_riscv_aia_free_hgei(vcpu->cpu, new_vsfile_hgei);
+ return ret;
+}
+
+int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask)
+{
+ u32 topei;
+ struct imsic_mrif_eix *eix;
+ int r, rc = KVM_INSN_CONTINUE_NEXT_SEPC;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ if (isel == KVM_RISCV_AIA_IMSIC_TOPEI) {
+ /* Read pending and enabled interrupt with highest priority */
+ topei = imsic_mrif_topei(imsic->swfile, imsic->nr_eix,
+ imsic->nr_msis);
+ if (val)
+ *val = topei;
+
+ /* Writes ignore value and clear top pending interrupt */
+ if (topei && wr_mask) {
+ topei >>= TOPEI_ID_SHIFT;
+ if (topei) {
+ eix = &imsic->swfile->eix[topei /
+ BITS_PER_TYPE(u64)];
+ clear_bit(topei & (BITS_PER_TYPE(u64) - 1),
+ eix->eip);
+ }
+ }
+ } else {
+ r = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, isel,
+ val, new_val, wr_mask);
+ /* Forward unknown IMSIC register to user-space */
+ if (r)
+ rc = (r == -ENOENT) ? 0 : KVM_INSN_ILLEGAL_TRAP;
+ }
+
+ if (wr_mask)
+ imsic_swfile_extirq_update(vcpu);
+
+ return rc;
+}
+
+int kvm_riscv_aia_imsic_rw_attr(struct kvm *kvm, unsigned long type,
+ bool write, unsigned long *val)
+{
+ u32 isel, vcpu_id;
+ unsigned long flags;
+ struct imsic *imsic;
+ struct kvm_vcpu *vcpu;
+ int rc, vsfile_hgei, vsfile_cpu;
+
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -ENODEV;
+
+ vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type);
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ if (!vcpu)
+ return -ENODEV;
+
+ isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type);
+ imsic = vcpu->arch.aia_context.imsic_state;
+
+ read_lock_irqsave(&imsic->vsfile_lock, flags);
+
+ rc = 0;
+ vsfile_hgei = imsic->vsfile_hgei;
+ vsfile_cpu = imsic->vsfile_cpu;
+ if (vsfile_cpu < 0) {
+ if (write) {
+ rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix,
+ isel, NULL, *val, -1UL);
+ imsic_swfile_extirq_update(vcpu);
+ } else
+ rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix,
+ isel, val, 0, 0);
+ }
+
+ read_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ if (!rc && vsfile_cpu >= 0)
+ rc = imsic_vsfile_rw(vsfile_hgei, vsfile_cpu, imsic->nr_eix,
+ isel, write, val);
+
+ return rc;
+}
+
+int kvm_riscv_aia_imsic_has_attr(struct kvm *kvm, unsigned long type)
+{
+ u32 isel, vcpu_id;
+ struct imsic *imsic;
+ struct kvm_vcpu *vcpu;
+
+ if (!kvm_riscv_aia_initialized(kvm))
+ return -ENODEV;
+
+ vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type);
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ if (!vcpu)
+ return -ENODEV;
+
+ isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type);
+ imsic = vcpu->arch.aia_context.imsic_state;
+ return imsic_mrif_isel_check(imsic->nr_eix, isel);
+}
+
+void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu)
+{
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ if (!imsic)
+ return;
+
+ kvm_riscv_vcpu_aia_imsic_release(vcpu);
+
+ memset(imsic->swfile, 0, sizeof(*imsic->swfile));
+}
+
+int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu,
+ u32 guest_index, u32 offset, u32 iid)
+{
+ unsigned long flags;
+ struct imsic_mrif_eix *eix;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ /* We only emulate one IMSIC MMIO page for each Guest VCPU */
+ if (!imsic || !iid || guest_index ||
+ (offset != IMSIC_MMIO_SETIPNUM_LE &&
+ offset != IMSIC_MMIO_SETIPNUM_BE))
+ return -ENODEV;
+
+ iid = (offset == IMSIC_MMIO_SETIPNUM_BE) ? __swab32(iid) : iid;
+ if (imsic->nr_msis <= iid)
+ return -EINVAL;
+
+ read_lock_irqsave(&imsic->vsfile_lock, flags);
+
+ if (imsic->vsfile_cpu >= 0) {
+ writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)];
+ set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip);
+ imsic_swfile_extirq_update(vcpu);
+ }
+
+ read_unlock_irqrestore(&imsic->vsfile_lock, flags);
+
+ return 0;
+}
+
+static int imsic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ if (len != 4 || (addr & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ *((u32 *)val) = 0;
+
+ return 0;
+}
+
+static int imsic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_msi msi = { 0 };
+
+ if (len != 4 || (addr & 0x3) != 0)
+ return -EOPNOTSUPP;
+
+ msi.address_hi = addr >> 32;
+ msi.address_lo = (u32)addr;
+ msi.data = *((const u32 *)val);
+ kvm_riscv_aia_inject_msi(vcpu->kvm, &msi);
+
+ return 0;
+};
+
+static struct kvm_io_device_ops imsic_iodoev_ops = {
+ .read = imsic_mmio_read,
+ .write = imsic_mmio_write,
+};
+
+int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu)
+{
+ int ret = 0;
+ struct imsic *imsic;
+ struct page *swfile_page;
+ struct kvm *kvm = vcpu->kvm;
+
+ /* Fail if we have zero IDs */
+ if (!kvm->arch.aia.nr_ids)
+ return -EINVAL;
+
+ /* Allocate IMSIC context */
+ imsic = kzalloc(sizeof(*imsic), GFP_KERNEL);
+ if (!imsic)
+ return -ENOMEM;
+ vcpu->arch.aia_context.imsic_state = imsic;
+
+ /* Setup IMSIC context */
+ imsic->nr_msis = kvm->arch.aia.nr_ids + 1;
+ rwlock_init(&imsic->vsfile_lock);
+ imsic->nr_eix = BITS_TO_U64(imsic->nr_msis);
+ imsic->nr_hw_eix = BITS_TO_U64(kvm_riscv_aia_max_ids);
+ imsic->vsfile_hgei = imsic->vsfile_cpu = -1;
+
+ /* Setup IMSIC SW-file */
+ swfile_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(sizeof(*imsic->swfile)));
+ if (!swfile_page) {
+ ret = -ENOMEM;
+ goto fail_free_imsic;
+ }
+ imsic->swfile = page_to_virt(swfile_page);
+ imsic->swfile_pa = page_to_phys(swfile_page);
+
+ /* Setup IO device */
+ kvm_iodevice_init(&imsic->iodev, &imsic_iodoev_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+ vcpu->arch.aia_context.imsic_addr,
+ KVM_DEV_RISCV_IMSIC_SIZE,
+ &imsic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret)
+ goto fail_free_swfile;
+
+ return 0;
+
+fail_free_swfile:
+ free_pages((unsigned long)imsic->swfile,
+ get_order(sizeof(*imsic->swfile)));
+fail_free_imsic:
+ vcpu->arch.aia_context.imsic_state = NULL;
+ kfree(imsic);
+ return ret;
+}
+
+void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct imsic *imsic = vcpu->arch.aia_context.imsic_state;
+
+ if (!imsic)
+ return;
+
+ imsic_vsfile_cleanup(imsic);
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &imsic->iodev);
+ mutex_unlock(&kvm->slots_lock);
+
+ free_pages((unsigned long)imsic->swfile,
+ get_order(sizeof(*imsic->swfile)));
+
+ vcpu->arch.aia_context.imsic_state = NULL;
+ kfree(imsic);
+}
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index a7112d583637..48ae0d4b3932 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -116,7 +116,8 @@ static int __init riscv_kvm_init(void)
kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
if (kvm_riscv_aia_available())
- kvm_info("AIA available\n");
+ kvm_info("AIA available with %d guest external interrupts\n",
+ kvm_riscv_aia_nr_hgei);
rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
if (rc) {
diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c
index 0e5479600695..44bc324aeeb0 100644
--- a/arch/riscv/kvm/tlb.c
+++ b/arch/riscv/kvm/tlb.c
@@ -296,7 +296,7 @@ static void make_xfence_request(struct kvm *kvm,
unsigned int actual_req = req;
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
- bitmap_clear(vcpu_mask, 0, KVM_MAX_VCPUS);
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
if (hbase != -1UL) {
if (vcpu->vcpu_id < hbase)
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index de24127e7e93..d12ef99901fc 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -64,6 +64,7 @@ static const unsigned long kvm_isa_ext_arr[] = {
KVM_ISA_EXT_ARR(SSAIA),
KVM_ISA_EXT_ARR(SSTC),
KVM_ISA_EXT_ARR(SVINVAL),
+ KVM_ISA_EXT_ARR(SVNAPOT),
KVM_ISA_EXT_ARR(SVPBMT),
KVM_ISA_EXT_ARR(ZBB),
KVM_ISA_EXT_ARR(ZIHINTPAUSE),
@@ -107,6 +108,7 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext)
case KVM_RISCV_ISA_EXT_SSAIA:
case KVM_RISCV_ISA_EXT_SSTC:
case KVM_RISCV_ISA_EXT_SVINVAL:
+ case KVM_RISCV_ISA_EXT_SVNAPOT:
case KVM_RISCV_ISA_EXT_ZIHINTPAUSE:
case KVM_RISCV_ISA_EXT_ZBB:
return false;
@@ -263,10 +265,12 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
+ kvm_riscv_aia_wakeon_hgei(vcpu, true);
}
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
+ kvm_riscv_aia_wakeon_hgei(vcpu, false);
}
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
index 4ea101a73d8b..2415722c01b8 100644
--- a/arch/riscv/kvm/vcpu_exit.c
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -183,6 +183,8 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
run->exit_reason = KVM_EXIT_UNKNOWN;
switch (trap->scause) {
case EXC_INST_ILLEGAL:
+ case EXC_LOAD_MISALIGNED:
+ case EXC_STORE_MISALIGNED:
if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) {
kvm_riscv_vcpu_trap_redirect(vcpu, trap);
ret = 1;
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index e52fde504433..7b46e04fb667 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -20,9 +20,7 @@ static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = {
};
#endif
-#ifdef CONFIG_RISCV_PMU_SBI
-extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu;
-#else
+#ifndef CONFIG_RISCV_PMU_SBI
static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = {
.extid_start = -1UL,
.extid_end = -1UL,
@@ -31,49 +29,49 @@ static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = {
#endif
struct kvm_riscv_sbi_extension_entry {
- enum KVM_RISCV_SBI_EXT_ID dis_idx;
+ enum KVM_RISCV_SBI_EXT_ID ext_idx;
const struct kvm_vcpu_sbi_extension *ext_ptr;
};
static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = {
{
- .dis_idx = KVM_RISCV_SBI_EXT_V01,
+ .ext_idx = KVM_RISCV_SBI_EXT_V01,
.ext_ptr = &vcpu_sbi_ext_v01,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_MAX, /* Can't be disabled */
+ .ext_idx = KVM_RISCV_SBI_EXT_MAX, /* Can't be disabled */
.ext_ptr = &vcpu_sbi_ext_base,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_TIME,
+ .ext_idx = KVM_RISCV_SBI_EXT_TIME,
.ext_ptr = &vcpu_sbi_ext_time,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_IPI,
+ .ext_idx = KVM_RISCV_SBI_EXT_IPI,
.ext_ptr = &vcpu_sbi_ext_ipi,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_RFENCE,
+ .ext_idx = KVM_RISCV_SBI_EXT_RFENCE,
.ext_ptr = &vcpu_sbi_ext_rfence,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_SRST,
+ .ext_idx = KVM_RISCV_SBI_EXT_SRST,
.ext_ptr = &vcpu_sbi_ext_srst,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_HSM,
+ .ext_idx = KVM_RISCV_SBI_EXT_HSM,
.ext_ptr = &vcpu_sbi_ext_hsm,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_PMU,
+ .ext_idx = KVM_RISCV_SBI_EXT_PMU,
.ext_ptr = &vcpu_sbi_ext_pmu,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL,
+ .ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL,
.ext_ptr = &vcpu_sbi_ext_experimental,
},
{
- .dis_idx = KVM_RISCV_SBI_EXT_VENDOR,
+ .ext_idx = KVM_RISCV_SBI_EXT_VENDOR,
.ext_ptr = &vcpu_sbi_ext_vendor,
},
};
@@ -147,7 +145,7 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu,
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
- if (sbi_ext[i].dis_idx == reg_num) {
+ if (sbi_ext[i].ext_idx == reg_num) {
sext = &sbi_ext[i];
break;
}
@@ -155,7 +153,15 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu,
if (!sext)
return -ENOENT;
- scontext->extension_disabled[sext->dis_idx] = !reg_val;
+ /*
+ * We can't set the extension status to available here, since it may
+ * have a probe() function which needs to confirm availability first,
+ * but it may be too early to call that here. We can set the status to
+ * unavailable, though.
+ */
+ if (!reg_val)
+ scontext->ext_status[sext->ext_idx] =
+ KVM_RISCV_SBI_EXT_UNAVAILABLE;
return 0;
}
@@ -172,7 +178,7 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu,
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
- if (sbi_ext[i].dis_idx == reg_num) {
+ if (sbi_ext[i].ext_idx == reg_num) {
sext = &sbi_ext[i];
break;
}
@@ -180,7 +186,15 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu,
if (!sext)
return -ENOENT;
- *reg_val = !scontext->extension_disabled[sext->dis_idx];
+ /*
+ * If the extension status is still uninitialized, then we should probe
+ * to determine if it's available, but it may be too early to do that
+ * here. The best we can do is report that the extension has not been
+ * disabled, i.e. we return 1 when the extension is available and also
+ * when it only may be available.
+ */
+ *reg_val = scontext->ext_status[sext->ext_idx] !=
+ KVM_RISCV_SBI_EXT_UNAVAILABLE;
return 0;
}
@@ -307,18 +321,32 @@ int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext(
struct kvm_vcpu *vcpu, unsigned long extid)
{
- int i;
- const struct kvm_riscv_sbi_extension_entry *sext;
struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context;
+ const struct kvm_riscv_sbi_extension_entry *entry;
+ const struct kvm_vcpu_sbi_extension *ext;
+ int i;
for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) {
- sext = &sbi_ext[i];
- if (sext->ext_ptr->extid_start <= extid &&
- sext->ext_ptr->extid_end >= extid) {
- if (sext->dis_idx < KVM_RISCV_SBI_EXT_MAX &&
- scontext->extension_disabled[sext->dis_idx])
+ entry = &sbi_ext[i];
+ ext = entry->ext_ptr;
+
+ if (ext->extid_start <= extid && ext->extid_end >= extid) {
+ if (entry->ext_idx >= KVM_RISCV_SBI_EXT_MAX ||
+ scontext->ext_status[entry->ext_idx] ==
+ KVM_RISCV_SBI_EXT_AVAILABLE)
+ return ext;
+ if (scontext->ext_status[entry->ext_idx] ==
+ KVM_RISCV_SBI_EXT_UNAVAILABLE)
return NULL;
- return sbi_ext[i].ext_ptr;
+ if (ext->probe && !ext->probe(vcpu)) {
+ scontext->ext_status[entry->ext_idx] =
+ KVM_RISCV_SBI_EXT_UNAVAILABLE;
+ return NULL;
+ }
+
+ scontext->ext_status[entry->ext_idx] =
+ KVM_RISCV_SBI_EXT_AVAILABLE;
+ return ext;
}
}
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index 6ef15f78e80f..7e2b50c692c1 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -55,11 +55,129 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_riscv_aia_destroy_vm(kvm);
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irql,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ return kvm_riscv_aia_inject_irq(kvm, irql->irq, irql->level);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id,
+ int level, bool line_status)
+{
+ struct kvm_msi msi;
+
+ if (!level)
+ return -1;
+
+ msi.address_lo = e->msi.address_lo;
+ msi.address_hi = e->msi.address_hi;
+ msi.data = e->msi.data;
+ msi.flags = e->msi.flags;
+ msi.devid = e->msi.devid;
+
+ return kvm_riscv_aia_inject_msi(kvm, &msi);
+}
+
+static int kvm_riscv_set_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id,
+ int level, bool line_status)
+{
+ return kvm_riscv_aia_inject_irq(kvm, e->irqchip.pin, level);
+}
+
+int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines)
+{
+ struct kvm_irq_routing_entry *ents;
+ int i, rc;
+
+ ents = kcalloc(lines, sizeof(*ents), GFP_KERNEL);
+ if (!ents)
+ return -ENOMEM;
+
+ for (i = 0; i < lines; i++) {
+ ents[i].gsi = i;
+ ents[i].type = KVM_IRQ_ROUTING_IRQCHIP;
+ ents[i].u.irqchip.irqchip = 0;
+ ents[i].u.irqchip.pin = i;
+ }
+ rc = kvm_set_irq_routing(kvm, ents, lines, 0);
+ kfree(ents);
+
+ return rc;
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ e->set = kvm_riscv_set_irq;
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
+ (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
+ goto out;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ e->msi.flags = ue->flags;
+ e->msi.devid = ue->u.msi.devid;
+ break;
+ default:
+ goto out;
+ }
+ r = 0;
+out:
+ return r;
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (!level)
+ return -EWOULDBLOCK;
+
+ switch (e->type) {
+ case KVM_IRQ_ROUTING_MSI:
+ return kvm_set_msi(e, kvm, irq_source_id, level, line_status);
+
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ return kvm_riscv_set_irq(e, kvm, irq_source_id,
+ level, line_status);
+ }
+
+ return -EWOULDBLOCK;
+}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ r = kvm_riscv_aia_available();
+ break;
case KVM_CAP_IOEVENTFD:
case KVM_CAP_DEVICE_CTRL:
case KVM_CAP_USER_MEMORY:
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index 0a077c0a2056..1e66d2cbb096 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -47,6 +47,10 @@ void uv_query_info(void)
uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len;
uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver;
uv_info.supp_att_pflags = uvcb.supp_att_pflags;
+ uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver;
+ uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf;
+ uv_info.supp_secret_types = uvcb.supp_secret_types;
+ uv_info.max_secrets = uvcb.max_secrets;
}
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 28a9ad57b6f1..d6bb2f4f78d1 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -58,6 +58,9 @@
#define UVC_CMD_SET_SHARED_ACCESS 0x1000
#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001
#define UVC_CMD_RETR_ATTEST 0x1020
+#define UVC_CMD_ADD_SECRET 0x1031
+#define UVC_CMD_LIST_SECRETS 0x1033
+#define UVC_CMD_LOCK_SECRETS 0x1034
/* Bits in installed uv calls */
enum uv_cmds_inst {
@@ -88,6 +91,9 @@ enum uv_cmds_inst {
BIT_UVC_CMD_DUMP_CPU = 26,
BIT_UVC_CMD_DUMP_COMPLETE = 27,
BIT_UVC_CMD_RETR_ATTEST = 28,
+ BIT_UVC_CMD_ADD_SECRET = 29,
+ BIT_UVC_CMD_LIST_SECRETS = 30,
+ BIT_UVC_CMD_LOCK_SECRETS = 31,
};
enum uv_feat_ind {
@@ -117,7 +123,7 @@ struct uv_cb_qui {
u32 reserved70[3]; /* 0x0070 */
u32 max_num_sec_conf; /* 0x007c */
u64 max_guest_stor_addr; /* 0x0080 */
- u8 reserved88[158 - 136]; /* 0x0088 */
+ u8 reserved88[0x9e - 0x88]; /* 0x0088 */
u16 max_guest_cpu_id; /* 0x009e */
u64 uv_feature_indications; /* 0x00a0 */
u64 reserveda8; /* 0x00a8 */
@@ -129,7 +135,12 @@ struct uv_cb_qui {
u64 reservedd8; /* 0x00d8 */
u64 supp_att_req_hdr_ver; /* 0x00e0 */
u64 supp_att_pflags; /* 0x00e8 */
- u8 reservedf0[256 - 240]; /* 0x00f0 */
+ u64 reservedf0; /* 0x00f0 */
+ u64 supp_add_secret_req_ver; /* 0x00f8 */
+ u64 supp_add_secret_pcf; /* 0x0100 */
+ u64 supp_secret_types; /* 0x0180 */
+ u16 max_secrets; /* 0x0110 */
+ u8 reserved112[0x120 - 0x112]; /* 0x0112 */
} __packed __aligned(8);
/* Initialize Ultravisor */
@@ -292,6 +303,19 @@ struct uv_cb_dump_complete {
u64 reserved30[5];
} __packed __aligned(8);
+/*
+ * A common UV call struct for pv guests that contains a single address
+ * Examples:
+ * Add Secret
+ * List Secrets
+ */
+struct uv_cb_guest_addr {
+ struct uv_cb_header header;
+ u64 reserved08[3];
+ u64 addr;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
static inline int __uv_call(unsigned long r1, unsigned long r2)
{
int cc;
@@ -365,6 +389,10 @@ struct uv_info {
unsigned long conf_dump_finalize_len;
unsigned long supp_att_req_hdr_ver;
unsigned long supp_att_pflags;
+ unsigned long supp_add_secret_req_ver;
+ unsigned long supp_add_secret_pcf;
+ unsigned long supp_secret_types;
+ unsigned short max_secrets;
};
extern struct uv_info uv_info;
diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h
index 10a5ac918e02..b9c2f14a6af3 100644
--- a/arch/s390/include/uapi/asm/uvdevice.h
+++ b/arch/s390/include/uapi/asm/uvdevice.h
@@ -32,6 +32,33 @@ struct uvio_attest {
__u16 reserved136; /* 0x0136 */
};
+/**
+ * uvio_uvdev_info - Information of supported functions
+ * @supp_uvio_cmds - supported IOCTLs by this device
+ * @supp_uv_cmds - supported UVCs corresponding to the IOCTL
+ *
+ * UVIO request to get information about supported request types by this
+ * uvdevice and the Ultravisor. Everything is output. Bits are in LSB0
+ * ordering. If the bit is set in both, @supp_uvio_cmds and @supp_uv_cmds, the
+ * uvdevice and the Ultravisor support that call.
+ *
+ * Note that bit 0 (UVIO_IOCTL_UVDEV_INFO_NR) is always zero for `supp_uv_cmds`
+ * as there is no corresponding UV-call.
+ */
+struct uvio_uvdev_info {
+ /*
+ * If bit `n` is set, this device supports the IOCTL with nr `n`.
+ */
+ __u64 supp_uvio_cmds;
+ /*
+ * If bit `n` is set, the Ultravisor(UV) supports the UV-call
+ * corresponding to the IOCTL with nr `n` in the calling contextx (host
+ * or guest). The value is only valid if the corresponding bit in
+ * @supp_uvio_cmds is set as well.
+ */
+ __u64 supp_uv_cmds;
+};
+
/*
* The following max values define an upper length for the IOCTL in/out buffers.
* However, they do not represent the maximum the Ultravisor allows which is
@@ -42,10 +69,34 @@ struct uvio_attest {
#define UVIO_ATT_ARCB_MAX_LEN 0x100000
#define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000
#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000
+#define UVIO_ADD_SECRET_MAX_LEN 0x100000
+#define UVIO_LIST_SECRETS_LEN 0x1000
#define UVIO_DEVICE_NAME "uv"
#define UVIO_TYPE_UVC 'u'
-#define UVIO_IOCTL_ATT _IOWR(UVIO_TYPE_UVC, 0x01, struct uvio_ioctl_cb)
+enum UVIO_IOCTL_NR {
+ UVIO_IOCTL_UVDEV_INFO_NR = 0x00,
+ UVIO_IOCTL_ATT_NR,
+ UVIO_IOCTL_ADD_SECRET_NR,
+ UVIO_IOCTL_LIST_SECRETS_NR,
+ UVIO_IOCTL_LOCK_SECRETS_NR,
+ /* must be the last entry */
+ UVIO_IOCTL_NUM_IOCTLS
+};
+
+#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb)
+#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR)
+#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR)
+#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR)
+#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR)
+#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR)
+
+#define UVIO_SUPP_CALL(nr) (1ULL << (nr))
+#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR)
+#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR)
+#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR)
+#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR)
+#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR)
#endif /* __S390_ASM_UVDEVICE_H */
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 3c62d1b218b1..66f0eb1c872b 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -23,12 +23,20 @@
int __bootdata_preserved(prot_virt_guest);
#endif
+/*
+ * uv_info contains both host and guest information but it's currently only
+ * expected to be used within modules if it's the KVM module or for
+ * any PV guest module.
+ *
+ * The kernel itself will write these values once in uv_query_info()
+ * and then make some of them readable via a sysfs interface.
+ */
struct uv_info __bootdata_preserved(uv_info);
+EXPORT_SYMBOL(uv_info);
#if IS_ENABLED(CONFIG_KVM)
int __bootdata_preserved(prot_virt_host);
EXPORT_SYMBOL(prot_virt_host);
-EXPORT_SYMBOL(uv_info);
static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len)
{
@@ -462,13 +470,13 @@ EXPORT_SYMBOL_GPL(arch_make_page_accessible);
#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
static ssize_t uv_query_facilities(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%lx\n%lx\n%lx\n%lx\n",
- uv_info.inst_calls_list[0],
- uv_info.inst_calls_list[1],
- uv_info.inst_calls_list[2],
- uv_info.inst_calls_list[3]);
+ return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n",
+ uv_info.inst_calls_list[0],
+ uv_info.inst_calls_list[1],
+ uv_info.inst_calls_list[2],
+ uv_info.inst_calls_list[3]);
}
static struct kobj_attribute uv_query_facilities_attr =
@@ -493,30 +501,27 @@ static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr =
__ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL);
static ssize_t uv_query_dump_cpu_len(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%lx\n",
- uv_info.guest_cpu_stor_len);
+ return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len);
}
static struct kobj_attribute uv_query_dump_cpu_len_attr =
__ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL);
static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%lx\n",
- uv_info.conf_dump_storage_state_len);
+ return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len);
}
static struct kobj_attribute uv_query_dump_storage_state_len_attr =
__ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL);
static ssize_t uv_query_dump_finalize_len(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%lx\n",
- uv_info.conf_dump_finalize_len);
+ return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len);
}
static struct kobj_attribute uv_query_dump_finalize_len_attr =
@@ -532,53 +537,86 @@ static struct kobj_attribute uv_query_feature_indications_attr =
__ATTR(feature_indications, 0444, uv_query_feature_indications, NULL);
static ssize_t uv_query_max_guest_cpus(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%d\n",
- uv_info.max_guest_cpu_id + 1);
+ return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1);
}
static struct kobj_attribute uv_query_max_guest_cpus_attr =
__ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL);
static ssize_t uv_query_max_guest_vms(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%d\n",
- uv_info.max_num_sec_conf);
+ return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf);
}
static struct kobj_attribute uv_query_max_guest_vms_attr =
__ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL);
static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%lx\n",
- uv_info.max_sec_stor_addr);
+ return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr);
}
static struct kobj_attribute uv_query_max_guest_addr_attr =
__ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver);
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver);
}
static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr =
__ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL);
static ssize_t uv_query_supp_att_pflags(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
- return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags);
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags);
}
static struct kobj_attribute uv_query_supp_att_pflags_attr =
__ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL);
+static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver);
+}
+
+static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr =
+ __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL);
+
+static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf);
+}
+
+static struct kobj_attribute uv_query_supp_add_secret_pcf_attr =
+ __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL);
+
+static ssize_t uv_query_supp_secret_types(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types);
+}
+
+static struct kobj_attribute uv_query_supp_secret_types_attr =
+ __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL);
+
+static ssize_t uv_query_max_secrets(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_secrets);
+}
+
+static struct kobj_attribute uv_query_max_secrets_attr =
+ __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL);
+
static struct attribute *uv_query_attrs[] = {
&uv_query_facilities_attr.attr,
&uv_query_feature_indications_attr.attr,
@@ -592,6 +630,10 @@ static struct attribute *uv_query_attrs[] = {
&uv_query_dump_cpu_len_attr.attr,
&uv_query_supp_att_req_hdr_ver_attr.attr,
&uv_query_supp_att_pflags_attr.attr,
+ &uv_query_supp_add_secret_req_ver_attr.attr,
+ &uv_query_supp_add_secret_pcf_attr.attr,
+ &uv_query_supp_secret_types_attr.attr,
+ &uv_query_max_secrets_attr.attr,
NULL,
};
@@ -600,18 +642,18 @@ static struct attribute_group uv_query_attr_group = {
};
static ssize_t uv_is_prot_virt_guest(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
int val = 0;
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
val = prot_virt_guest;
#endif
- return scnprintf(page, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static ssize_t uv_is_prot_virt_host(struct kobject *kobj,
- struct kobj_attribute *attr, char *page)
+ struct kobj_attribute *attr, char *buf)
{
int val = 0;
@@ -619,7 +661,7 @@ static ssize_t uv_is_prot_virt_host(struct kobject *kobj,
val = prot_virt_host;
#endif
- return scnprintf(page, PAGE_SIZE, "%d\n", val);
+ return sysfs_emit(buf, "%d\n", val);
}
static struct kobj_attribute uv_prot_virt_guest =
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 807fa9da1e72..3c65b8258ae6 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -166,6 +166,7 @@ static int diag9c_forwarding_overrun(void)
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
+ int tcpu_cpu;
int tid;
tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
@@ -181,14 +182,15 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
goto no_yield;
/* target guest VCPU already running */
- if (READ_ONCE(tcpu->cpu) >= 0) {
+ tcpu_cpu = READ_ONCE(tcpu->cpu);
+ if (tcpu_cpu >= 0) {
if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
goto no_yield;
/* target host CPU already running */
- if (!vcpu_is_preempted(tcpu->cpu))
+ if (!vcpu_is_preempted(tcpu_cpu))
goto no_yield;
- smp_yield_cpu(tcpu->cpu);
+ smp_yield_cpu(tcpu_cpu);
VCPU_EVENT(vcpu, 5,
"diag time slice end directed to %d: yield forwarded",
tid);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 17b81659cdb2..670019696464 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2156,6 +2156,10 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
ofs = 0;
}
+
+ if (cur_gfn < ms->base_gfn)
+ ofs = 0;
+
ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 8d6b765abf29..0333ee482eb8 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -177,7 +177,8 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
sizeof(struct kvm_s390_apcb0)))
return -EFAULT;
- bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0));
+ bitmap_and(apcb_s, apcb_s, apcb_h,
+ BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0));
return 0;
}
@@ -203,7 +204,8 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
sizeof(struct kvm_s390_apcb1)))
return -EFAULT;
- bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1));
+ bitmap_and(apcb_s, apcb_s, apcb_h,
+ BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1));
return 0;
}
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index c17e3e96fc1d..6c98f4bb4228 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -13,7 +13,6 @@ BUILD_BUG_ON(1)
* at the call sites.
*/
KVM_X86_PMU_OP(hw_event_available)
-KVM_X86_PMU_OP(pmc_is_enabled)
KVM_X86_PMU_OP(pmc_idx_to_pmc)
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fb9d1f2d6136..28bd38303d70 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -523,7 +523,7 @@ struct kvm_pmu {
u64 global_status;
u64 counter_bitmask[2];
u64 global_ctrl_mask;
- u64 global_ovf_ctrl_mask;
+ u64 global_status_mask;
u64 reserved_bits;
u64 raw_event_mask;
struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0c9660a07b23..7f4d13383cf2 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -501,20 +501,15 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries)
{
- int r;
-
- r = -E2BIG;
if (cpuid->nent < vcpu->arch.cpuid_nent)
- goto out;
- r = -EFAULT;
+ return -E2BIG;
+
if (copy_to_user(entries, vcpu->arch.cpuid_entries,
vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
- goto out;
- return 0;
+ return -EFAULT;
-out:
cpuid->nent = vcpu->arch.cpuid_nent;
- return r;
+ return 0;
}
/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */
@@ -734,6 +729,10 @@ void kvm_set_cpu_caps(void)
F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
);
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
+ F(PERFMON_V2)
+ );
+
/*
* Synthesize "LFENCE is serializing" into the AMD-defined entry in
* KVM's supported CPUID if the feature is reported as supported by the
@@ -948,7 +947,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
union cpuid10_eax eax;
union cpuid10_edx edx;
- if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
break;
}
@@ -1128,7 +1127,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
break;
case 0x80000000:
- entry->eax = min(entry->eax, 0x80000021);
+ entry->eax = min(entry->eax, 0x80000022);
/*
* Serializing LFENCE is reported in a multitude of ways, and
* NullSegClearsBase is not reported in CPUID on Zen2; help
@@ -1233,6 +1232,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = entry->ecx = entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0021_EAX);
break;
+ /* AMD Extended Performance Monitoring and Debug */
+ case 0x80000022: {
+ union cpuid_0x80000022_ebx ebx;
+
+ entry->ecx = entry->edx = 0;
+ if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
+ entry->eax = entry->ebx;
+ break;
+ }
+
+ cpuid_entry_override(entry, CPUID_8000_0022_EAX);
+
+ if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+ ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
+ else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
+ ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE;
+ else
+ ebx.split.num_core_pmc = AMD64_NUM_COUNTERS;
+
+ entry->ebx = ebx.full;
+ break;
+ }
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
/*Just support up to 0xC0000004 now*/
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4756bcb5724f..8dec646e764b 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -411,7 +411,10 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state);
+ /* Bit 7 is 1, means there's an interrupt */
+ ret |= 0x80;
} else {
+ /* Bit 7 is 0, means there's no interrupt */
ret = 0x07;
pic_update_irq(s->pics_state);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3c300a196bdf..113ca9661ab2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -51,11 +51,6 @@
#define mod_64(x, y) ((x) % (y))
#endif
-#define PRId64 "d"
-#define PRIx64 "llx"
-#define PRIu64 "u"
-#define PRIo64 "o"
-
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION 0x14UL
#define LAPIC_MMIO_LENGTH (1 << 12)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6eaa3d6994ae..ec169f5c7dce 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -58,6 +58,8 @@
extern bool itlb_multihit_kvm_mitigation;
+static bool nx_hugepage_mitigation_hard_disabled;
+
int __read_mostly nx_huge_pages = -1;
static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
@@ -67,12 +69,13 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops nx_huge_pages_ops = {
.set = set_nx_huge_pages,
- .get = param_get_bool,
+ .get = get_nx_huge_pages,
};
static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
@@ -1600,6 +1603,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ if (kvm_x86_ops.set_apic_access_page_addr &&
+ range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+
return flush;
}
@@ -5797,6 +5804,14 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
vcpu_clear_mmio_info(vcpu, addr);
+ /*
+ * Walking and synchronizing SPTEs both assume they are operating in
+ * the context of the current MMU, and would need to be reworked if
+ * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
+ */
+ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
+ return;
+
if (!VALID_PAGE(root_hpa))
return;
@@ -6844,6 +6859,14 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
+{
+ if (nx_hugepage_mitigation_hard_disabled)
+ return sprintf(buffer, "never\n");
+
+ return param_get_bool(buffer, kp);
+}
+
static bool get_nx_auto_mode(void)
{
/* Return true when CPU has the bug, and mitigations are ON */
@@ -6860,15 +6883,29 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
bool old_val = nx_huge_pages;
bool new_val;
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
/* In "auto" mode deploy workaround only if CPU has the bug. */
- if (sysfs_streq(val, "off"))
+ if (sysfs_streq(val, "off")) {
new_val = 0;
- else if (sysfs_streq(val, "force"))
+ } else if (sysfs_streq(val, "force")) {
new_val = 1;
- else if (sysfs_streq(val, "auto"))
+ } else if (sysfs_streq(val, "auto")) {
new_val = get_nx_auto_mode();
- else if (kstrtobool(val, &new_val) < 0)
+ } else if (sysfs_streq(val, "never")) {
+ new_val = 0;
+
+ mutex_lock(&kvm_lock);
+ if (!list_empty(&vm_list)) {
+ mutex_unlock(&kvm_lock);
+ return -EBUSY;
+ }
+ nx_hugepage_mitigation_hard_disabled = true;
+ mutex_unlock(&kvm_lock);
+ } else if (kstrtobool(val, &new_val) < 0) {
return -EINVAL;
+ }
__set_nx_huge_pages(new_val);
@@ -7006,6 +7043,9 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
uint old_period, new_period;
int err;
+ if (nx_hugepage_mitigation_hard_disabled)
+ return -EPERM;
+
was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
err = param_set_uint(val, kp);
@@ -7164,6 +7204,9 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
{
int err;
+ if (nx_hugepage_mitigation_hard_disabled)
+ return 0;
+
err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
"kvm-nx-lpage-recovery",
&kvm->arch.nx_huge_page_recovery_thread);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 08340219c35a..512163d52194 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -592,7 +592,10 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
/*
* Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
- * does not hold the mmu_lock.
+ * does not hold the mmu_lock. On failure, i.e. if a different logical
+ * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
+ * the current value, so the caller operates on fresh data, e.g. if it
+ * retries tdp_mmu_set_spte_atomic()
*/
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 9fac1ec03463..3eb6e7f47e96 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -25,10 +25,24 @@
#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
+static bool is_mtrr_base_msr(unsigned int msr)
+{
+ /* MTRR base MSRs use even numbers, masks use odd numbers. */
+ return !(msr & 0x1);
+}
+
+static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
+ unsigned int msr)
+{
+ int index = (msr - MTRRphysBase_MSR(0)) / 2;
+
+ return &vcpu->arch.mtrr_state.var_ranges[index];
+}
+
static bool msr_mtrr_valid(unsigned msr)
{
switch (msr) {
- case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+ case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
case MSR_MTRRfix64K_00000:
case MSR_MTRRfix16K_80000:
case MSR_MTRRfix16K_A0000:
@@ -41,7 +55,6 @@ static bool msr_mtrr_valid(unsigned msr)
case MSR_MTRRfix4K_F0000:
case MSR_MTRRfix4K_F8000:
case MSR_MTRRdefType:
- case MSR_IA32_CR_PAT:
return true;
}
return false;
@@ -52,7 +65,7 @@ static bool valid_mtrr_type(unsigned t)
return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
}
-bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
int i;
u64 mask;
@@ -60,9 +73,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!msr_mtrr_valid(msr))
return false;
- if (msr == MSR_IA32_CR_PAT) {
- return kvm_pat_valid(data);
- } else if (msr == MSR_MTRRdefType) {
+ if (msr == MSR_MTRRdefType) {
if (data & ~0xcff)
return false;
return valid_mtrr_type(data & 0xff);
@@ -74,7 +85,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
/* variable MTRRs */
- WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+ WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
+ msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
@@ -88,7 +100,6 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return (data & mask) == 0;
}
-EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
{
@@ -308,10 +319,8 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
gfn_t start, end;
- int index;
- if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
- !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm))
return;
if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
@@ -326,8 +335,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
end = ~0ULL;
} else {
/* variable range MTRRs. */
- index = (msr - 0x200) / 2;
- var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end);
+ var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
}
kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
@@ -342,21 +350,18 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
struct kvm_mtrr_range *tmp, *cur;
- int index, is_mtrr_mask;
- index = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * index;
- cur = &mtrr_state->var_ranges[index];
+ cur = var_mtrr_msr_to_range(vcpu, msr);
/* remove the entry if it's in the list. */
if (var_mtrr_range_is_valid(cur))
- list_del(&mtrr_state->var_ranges[index].node);
+ list_del(&cur->node);
/*
* Set all illegal GPA bits in the mask, since those bits must
* implicitly be 0. The bits are then cleared when reading them.
*/
- if (!is_mtrr_mask)
+ if (is_mtrr_base_msr(msr))
cur->base = data;
else
cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
@@ -382,8 +387,6 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
*(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
else if (msr == MSR_MTRRdefType)
vcpu->arch.mtrr_state.deftype = data;
- else if (msr == MSR_IA32_CR_PAT)
- vcpu->arch.pat = data;
else
set_var_mtrr_msr(vcpu, msr, data);
@@ -411,21 +414,16 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 1;
index = fixed_msr_to_range_index(msr);
- if (index >= 0)
+ if (index >= 0) {
*pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
- else if (msr == MSR_MTRRdefType)
+ } else if (msr == MSR_MTRRdefType) {
*pdata = vcpu->arch.mtrr_state.deftype;
- else if (msr == MSR_IA32_CR_PAT)
- *pdata = vcpu->arch.pat;
- else { /* Variable MTRRs */
- int is_mtrr_mask;
-
- index = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * index;
- if (!is_mtrr_mask)
- *pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
+ } else {
+ /* Variable MTRRs */
+ if (is_mtrr_base_msr(msr))
+ *pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
else
- *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+ *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
*pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 1690d41c1830..bf653df86112 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -93,11 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
#undef __KVM_X86_PMU_OP
}
-static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
-{
- return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
-}
-
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
@@ -562,6 +557,14 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
+ default:
+ break;
+ }
return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}
@@ -577,13 +580,86 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ msr_info->data = pmu->global_status;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ msr_info->data = pmu->global_ctrl;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ msr_info->data = 0;
+ break;
+ default:
+ return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
+ }
+
+ return 0;
}
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
- return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+ u64 diff;
+
+ /*
+ * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
+ * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
+ */
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (!msr_info->host_initiated)
+ return 1; /* RO MSR */
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ /* Per PPR, Read-only MSR. Writes are ignored. */
+ if (!msr_info->host_initiated)
+ break;
+
+ if (data & pmu->global_status_mask)
+ return 1;
+
+ pmu->global_status = data;
+ break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ data &= ~pmu->global_ctrl_mask;
+ fallthrough;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (!kvm_valid_perf_global_ctrl(pmu, data))
+ return 1;
+
+ if (pmu->global_ctrl != data) {
+ diff = pmu->global_ctrl ^ data;
+ pmu->global_ctrl = data;
+ reprogram_counters(pmu, diff);
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ /*
+ * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
+ * GLOBAL_STATUS, and so the set of reserved bits is the same.
+ */
+ if (data & pmu->global_status_mask)
+ return 1;
+ fallthrough;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ break;
+ default:
+ kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
+ return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
+ }
+
+ return 0;
}
/* refresh PMU settings. This function generally is called when underlying
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 5c7bbf03b599..7d9ba301c090 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -20,7 +20,6 @@
struct kvm_pmu_ops {
bool (*hw_event_available)(struct kvm_pmc *pmc);
- bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask);
@@ -37,10 +36,25 @@ struct kvm_pmu_ops {
const u64 EVENTSEL_EVENT;
const int MAX_NR_GP_COUNTERS;
+ const int MIN_NR_GP_COUNTERS;
};
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
+{
+ /*
+ * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is
+ * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is
+ * greater than zero. However, KVM only exposes and emulates the MSR
+ * to/for the guest if the guest PMU supports at least "Architectural
+ * Performance Monitoring Version 2".
+ *
+ * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2.
+ */
+ return pmu->version > 1;
+}
+
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -161,6 +175,7 @@ extern struct x86_pmu_capability kvm_pmu_cap;
static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
{
bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+ int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS;
/*
* Hybrid PMUs don't play nice with virtualization without careful
@@ -175,11 +190,15 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
perf_get_x86_pmu_capability(&kvm_pmu_cap);
/*
- * For Intel, only support guest architectural pmu
- * on a host with architectural pmu.
+ * WARN if perf did NOT disable hardware PMU if the number of
+ * architecturally required GP counters aren't present, i.e. if
+ * there are a non-zero number of counters, but fewer than what
+ * is architecturally required.
*/
- if ((is_intel && !kvm_pmu_cap.version) ||
- !kvm_pmu_cap.num_counters_gp)
+ if (!kvm_pmu_cap.num_counters_gp ||
+ WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs))
+ enable_pmu = false;
+ else if (is_intel && !kvm_pmu_cap.version)
enable_pmu = false;
}
@@ -201,6 +220,33 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
+static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
+{
+ int bit;
+
+ if (!diff)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ set_bit(bit, pmu->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
+}
+
+/*
+ * Check if a PMC is enabled by comparing it against global_ctrl bits.
+ *
+ * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled.
+ */
+static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ return true;
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index a5717282bb9c..56cbdb24400a 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -15,6 +15,7 @@ enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
CPUID_7_1_EDX,
CPUID_8000_0007_EDX,
+ CPUID_8000_0022_EAX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -47,6 +48,9 @@ enum kvm_only_cpuid_leafs {
/* CPUID level 0x80000007 (EDX). */
#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
+/* CPUID level 0x80000022 (EAX) */
+#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -74,6 +78,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
[CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
+ [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
};
/*
@@ -108,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
return KVM_X86_FEATURE_SGX_EDECCSSA;
else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
return KVM_X86_FEATURE_CONSTANT_TSC;
+ else if (x86_feature == X86_FEATURE_PERFMON_V2)
+ return KVM_X86_FEATURE_PERFMON_V2;
return x86_feature;
}
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 5fa939e411d8..cef5a3d0abd0 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -78,14 +78,6 @@ static bool amd_hw_event_available(struct kvm_pmc *pmc)
return true;
}
-/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
- * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
- */
-static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
-{
- return true;
-}
-
static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -102,12 +94,6 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30));
}
-static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
-{
- /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */
- return false;
-}
-
static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -119,6 +105,29 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
return pmc;
}
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ switch (msr) {
+ case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
+ return pmu->version > 0;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE);
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ return pmu->version > 1;
+ default:
+ if (msr > MSR_F15H_PERF_CTR5 &&
+ msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters)
+ return pmu->version > 1;
+ break;
+ }
+
+ return amd_msr_idx_to_pmc(vcpu, msr);
+}
+
static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -172,20 +181,39 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ union cpuid_0x80000022_ebx ebx;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ pmu->version = 1;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+ pmu->version = 2;
+ /*
+ * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
+ * CPUID entry is guaranteed to be non-NULL.
+ */
+ BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
+ x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
+ ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
+ pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
+ } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
- else
+ } else {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ }
+
+ pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters,
+ kvm_pmu_cap.num_counters_gp);
+
+ if (pmu->version > 1) {
+ pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1);
+ pmu->global_status_mask = pmu->global_ctrl_mask;
+ }
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
pmu->reserved_bits = 0xfffffff000280000ull;
pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
- pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->nr_arch_fixed_counters = 0;
- pmu->global_status = 0;
bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters);
}
@@ -216,11 +244,12 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
+
+ pmu->global_ctrl = pmu->global_status = 0;
}
struct kvm_pmu_ops amd_pmu_ops __initdata = {
.hw_event_available = amd_hw_event_available,
- .pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
@@ -233,4 +262,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.reset = amd_pmu_reset,
.EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
.MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
+ .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS,
};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 69ae5e1b3120..07756b7348ae 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2216,10 +2216,7 @@ void __init sev_hardware_setup(void)
}
sev_asid_count = max_sev_asid - min_sev_asid + 1;
- if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
- goto out;
-
- pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
sev_supported = true;
/* SEV-ES support requested? */
@@ -2244,13 +2241,19 @@ void __init sev_hardware_setup(void)
goto out;
sev_es_asid_count = min_sev_asid - 1;
- if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count))
- goto out;
-
- pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count));
sev_es_supported = true;
out:
+ if (boot_cpu_has(X86_FEATURE_SEV))
+ pr_info("SEV %s (ASIDs %u - %u)\n",
+ sev_supported ? "enabled" : "disabled",
+ min_sev_asid, max_sev_asid);
+ if (boot_cpu_has(X86_FEATURE_SEV_ES))
+ pr_info("SEV-ES %s (ASIDs %u - %u)\n",
+ sev_es_supported ? "enabled" : "disabled",
+ min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1);
+
sev_enabled = sev_supported;
sev_es_enabled = sev_es_supported;
#endif
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 54089f990c8f..d381ad424554 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -244,15 +244,6 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa";
static unsigned long iopm_base;
-struct kvm_ldttss_desc {
- u16 limit0;
- u16 base0;
- unsigned base1:8, type:5, dpl:2, p:1;
- unsigned limit1:4, zero0:3, g:1, base2:8;
- u32 base3;
- u32 zero1;
-} __attribute__((packed));
-
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
/*
@@ -588,7 +579,6 @@ static int svm_hardware_enable(void)
struct svm_cpu_data *sd;
uint64_t efer;
- struct desc_struct *gdt;
int me = raw_smp_processor_id();
rdmsrl(MSR_EFER, efer);
@@ -601,9 +591,6 @@ static int svm_hardware_enable(void)
sd->next_asid = sd->max_asid + 1;
sd->min_asid = max_sev_asid + 1;
- gdt = get_current_gdt_rw();
- sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
-
wrmsrl(MSR_EFER, efer | EFER_SVME);
wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);
@@ -752,7 +739,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
BUG_ON(offset == MSR_INVALID);
- return !!test_bit(bit_write, &tmp);
+ return test_bit(bit_write, &tmp);
}
static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
@@ -2939,9 +2926,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_CR_PAT:
- if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
- return 1;
- vcpu->arch.pat = data;
+ ret = kvm_set_msr_common(vcpu, msr);
+ if (ret)
+ break;
+
svm->vmcb01.ptr->save.g_pat = data;
if (is_guest_mode(vcpu))
nested_vmcb02_compute_g_pat(svm);
@@ -3418,8 +3406,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(vcpu, KVM_ISA_SVM);
-
/* SEV-ES guests must use the CR write traps to track CR registers. */
if (!sev_es_guest(vcpu->kvm)) {
if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
@@ -3457,14 +3443,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return svm_invoke_exit_handler(vcpu, exit_code);
}
-static void reload_tss(struct kvm_vcpu *vcpu)
-{
- struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
-
- sd->tss_desc->type = 9; /* available 32/64-bit TSS */
- load_TR_desc();
-}
-
static void pre_svm_run(struct kvm_vcpu *vcpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
@@ -4099,9 +4077,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted);
- if (!sev_es_guest(vcpu->kvm))
- reload_tss(vcpu);
-
if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
@@ -4156,6 +4131,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(vcpu);
+ trace_kvm_exit(vcpu, KVM_ISA_SVM);
+
svm_complete_interrupts(vcpu);
if (is_guest_mode(vcpu))
@@ -5025,9 +5002,22 @@ static __init void svm_set_cpu_caps(void)
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
- /* AMD PMU PERFCTR_CORE CPUID */
- if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
- kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+ if (enable_pmu) {
+ /*
+ * Enumerate support for PERFCTR_CORE if and only if KVM has
+ * access to enough counters to virtualize "core" support,
+ * otherwise limit vPMU support to the legacy number of counters.
+ */
+ if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE)
+ kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS,
+ kvm_pmu_cap.num_counters_gp);
+ else
+ kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE);
+
+ if (kvm_pmu_cap.version != 2 ||
+ !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2);
+ }
/* CPUID 0x8000001F (SME/SEV features) */
sev_set_cpu_caps();
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f44751dd8d5d..18af7e712a5a 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -303,7 +303,6 @@ struct svm_cpu_data {
u32 max_asid;
u32 next_asid;
u32 min_asid;
- struct kvm_ldttss_desc *tss_desc;
struct page *save_area;
unsigned long save_area_pa;
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 45162c1bcd8f..d0abee35d7ba 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -152,8 +152,8 @@ static inline bool cpu_has_vmx_ept(void)
static inline bool vmx_umip_emulated(void)
{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_DESC;
+ return !boot_cpu_has(X86_FEATURE_UMIP) &&
+ (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_DESC);
}
static inline bool cpu_has_vmx_rdtscp(void)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index e35cf0bd0df9..516391cc0d64 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2328,8 +2328,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
* will not have to rewrite the controls just for this bit.
*/
- if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
- (vmcs12->guest_cr4 & X86_CR4_UMIP))
+ if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
exec_control |= SECONDARY_EXEC_DESC;
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
@@ -2649,7 +2648,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
- intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
+ kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->guest_ia32_perf_global_ctrl))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
@@ -4524,7 +4523,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.pat = vmcs12->host_ia32_pat;
}
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
- intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
+ kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->host_ia32_perf_global_ctrl));
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 741efe2c497b..80c769c58a87 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -73,18 +73,6 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
}
}
-static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
-{
- int bit;
-
- if (!diff)
- return;
-
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- set_bit(bit, pmu->reprogram_pmi);
- kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
-}
-
static bool intel_hw_event_available(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -107,17 +95,6 @@ static bool intel_hw_event_available(struct kvm_pmc *pmc)
return true;
}
-/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
-static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
-
- if (!intel_pmu_has_perf_global_ctrl(pmu))
- return true;
-
- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
-}
-
static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -198,11 +175,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- case MSR_CORE_PERF_GLOBAL_STATUS:
- case MSR_CORE_PERF_GLOBAL_CTRL:
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- return intel_pmu_has_perf_global_ctrl(pmu);
- break;
+ return kvm_pmu_has_perf_global_ctrl(pmu);
case MSR_IA32_PEBS_ENABLE:
ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
break;
@@ -352,15 +325,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_FIXED_CTR_CTRL:
msr_info->data = pmu->fixed_ctr_ctrl;
break;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- msr_info->data = pmu->global_status;
- break;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- msr_info->data = pmu->global_ctrl;
- break;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- msr_info->data = 0;
- break;
case MSR_IA32_PEBS_ENABLE:
msr_info->data = pmu->pebs_enable;
break;
@@ -410,29 +374,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
break;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- if (!msr_info->host_initiated)
- return 1; /* RO MSR */
-
- pmu->global_status = data;
- break;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- if (!kvm_valid_perf_global_ctrl(pmu, data))
- return 1;
-
- if (pmu->global_ctrl != data) {
- diff = pmu->global_ctrl ^ data;
- pmu->global_ctrl = data;
- reprogram_counters(pmu, diff);
- }
- break;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (data & pmu->global_ovf_ctrl_mask)
- return 1;
-
- if (!msr_info->host_initiated)
- pmu->global_status &= ~data;
- break;
case MSR_IA32_PEBS_ENABLE:
if (data & pmu->pebs_enable_mask)
return 1;
@@ -444,8 +385,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
case MSR_IA32_DS_AREA:
- if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS))
- return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
@@ -531,7 +470,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull;
pmu->raw_event_mask = X86_RAW_EVENT_MASK;
pmu->global_ctrl_mask = ~0ull;
- pmu->global_ovf_ctrl_mask = ~0ull;
+ pmu->global_status_mask = ~0ull;
pmu->fixed_ctr_ctrl_mask = ~0ull;
pmu->pebs_enable_mask = ~0ull;
pmu->pebs_data_cfg_mask = ~0ull;
@@ -585,11 +524,17 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
pmu->global_ctrl_mask = counter_mask;
- pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
+
+ /*
+ * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
+ * share reserved bit definitions. The kernel just happens to use
+ * OVF_CTRL for the names.
+ */
+ pmu->global_status_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
if (vmx_pt_mode_is_host_guest())
- pmu->global_ovf_ctrl_mask &=
+ pmu->global_status_mask &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
@@ -801,7 +746,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
pmc = intel_pmc_idx_to_pmc(pmu, bit);
if (!pmc || !pmc_speculative_in_use(pmc) ||
- !intel_pmc_is_enabled(pmc) || !pmc->perf_event)
+ !pmc_is_globally_enabled(pmc) || !pmc->perf_event)
continue;
/*
@@ -816,7 +761,6 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
struct kvm_pmu_ops intel_pmu_ops __initdata = {
.hw_event_available = intel_hw_event_available,
- .pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = intel_msr_idx_to_pmc,
@@ -831,4 +775,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.cleanup = intel_pmu_cleanup,
.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
.MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ .MIN_NR_GP_COUNTERS = 1,
};
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 2261b684a7d4..3e822e582497 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -357,11 +357,12 @@ static int handle_encls_einit(struct kvm_vcpu *vcpu)
static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
{
- if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX))
- return false;
-
+ /*
+ * ENCLS generates a #UD if SGX1 isn't supported, i.e. this point will
+ * be reached if and only if the SGX1 leafs are enabled.
+ */
if (leaf >= ECREATE && leaf <= ETRACK)
- return guest_cpuid_has(vcpu, X86_FEATURE_SGX1);
+ return true;
if (leaf >= EAUG && leaf <= EMODT)
return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
@@ -380,9 +381,11 @@ int handle_encls(struct kvm_vcpu *vcpu)
{
u32 leaf = (u32)kvm_rax_read(vcpu);
- if (!encls_leaf_enabled_in_guest(vcpu, leaf)) {
+ if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
kvm_queue_exception(vcpu, UD_VECTOR);
- } else if (!sgx_enabled_in_guest_bios(vcpu)) {
+ } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) ||
+ !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) {
kvm_inject_gp(vcpu, 0);
} else {
if (leaf == ECREATE)
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 631fd7da2bc3..07e927d4d099 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -187,7 +187,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
_ASM_EXTABLE(.Lvmresume, .Lfixup)
_ASM_EXTABLE(.Lvmlaunch, .Lfixup)
-SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
+SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
/* Restore unwind state from before the VMRESUME/VMLAUNCH. */
UNWIND_HINT_RESTORE
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 44fb619803b8..0ecf4be2c6af 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2287,19 +2287,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
goto find_uret_msr;
case MSR_IA32_CR_PAT:
- if (!kvm_pat_valid(data))
- return 1;
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ if (ret)
+ break;
if (is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
get_vmcs12(vcpu)->guest_ia32_pat = data;
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, data);
- vcpu->arch.pat = data;
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
@@ -3387,15 +3384,15 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long old_cr4 = vcpu->arch.cr4;
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr4;
+
/*
* Pass through host's Machine Check Enable value to hw_cr4, which
* is in force while we are in guest mode. Do not let guests control
* this bit, even if host CR4.MCE == 0.
*/
- unsigned long hw_cr4;
-
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
if (is_unrestricted_guest(vcpu))
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
@@ -3404,7 +3401,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
else
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
- if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+ if (vmx_umip_emulated()) {
if (cr4 & X86_CR4_UMIP) {
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
hw_cr4 &= ~X86_CR4_UMIP;
@@ -5402,7 +5399,13 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_desc(struct kvm_vcpu *vcpu)
{
- WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+ /*
+ * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
+ * and other code needs to be updated if UMIP can be guest owned.
+ */
+ BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
+
+ WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
return kvm_emulate_instruction(vcpu, 0);
}
@@ -6708,7 +6711,12 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
- struct page *page;
+ const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot;
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
/* Defer reload until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
@@ -6720,18 +6728,53 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
return;
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page))
+ /*
+ * Grab the memslot so that the hva lookup for the mmu_notifier retry
+ * is guaranteed to use the same memslot as the pfn lookup, i.e. rely
+ * on the pfn lookup's validation of the memslot to ensure a valid hva
+ * is used for the retry check.
+ */
+ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return;
- vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+ /*
+ * Ensure that the mmu_notifier sequence count is read before KVM
+ * retrieves the pfn from the primary MMU. Note, the memslot is
+ * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
+ * in kvm_mmu_invalidate_end().
+ */
+ mmu_seq = kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * No need to retry if the memslot does not exist or is invalid. KVM
+ * controls the APIC-access page memslot, and only deletes the memslot
+ * if APICv is permanently inhibited, i.e. the memslot won't reappear.
+ */
+ pfn = gfn_to_pfn_memslot(slot, gfn);
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ read_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_invalidate_retry_hva(kvm, mmu_seq,
+ gfn_to_hva_memslot(slot, gfn))) {
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ read_unlock(&vcpu->kvm->mmu_lock);
+ goto out;
+ }
+
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
+ read_unlock(&vcpu->kvm->mmu_lock);
+
vmx_flush_tlb_current(vcpu);
+out:
/*
* Do not pin apic access page in memory, the MMU notifier
* will call us again if it is migrated or swapped out.
*/
- put_page(page);
+ kvm_release_pfn_clean(pfn);
}
static void vmx_hwapic_isr_update(int max_isr)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9e66531861cf..32384ba38499 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -93,18 +93,6 @@ union vmx_exit_reason {
u32 full;
};
-static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
-{
- /*
- * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is
- * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is
- * greater than zero. However, KVM only exposes and emulates the MSR
- * to/for the guest if the guest PMU supports at least "Architectural
- * Performance Monitoring Version 2".
- */
- return pmu->version > 1;
-}
-
struct lbr_desc {
/* Basic info about guest LBR records. */
struct x86_pmu_lbr records;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7f70207e8689..a6b9bea62fb8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1017,13 +1017,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
- if (static_cpu_has(X86_FEATURE_PKU) &&
+ if (cpu_feature_enabled(X86_FEATURE_PKU) &&
vcpu->arch.pkru != vcpu->arch.host_pkru &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
write_pkru(vcpu->arch.pkru);
-#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
@@ -1032,15 +1030,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return;
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
- if (static_cpu_has(X86_FEATURE_PKU) &&
+ if (cpu_feature_enabled(X86_FEATURE_PKU) &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
write_pkru(vcpu->arch.host_pkru);
}
-#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
@@ -1427,15 +1423,14 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
- * List of msr numbers which we expose to userspace through KVM_GET_MSRS
- * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
- *
- * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
- * extract the supported MSRs from the related const lists.
- * msrs_to_save is selected from the msrs_to_save_all to reflect the
- * capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
- * may depend on host virtualization features rather than host cpu features.
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
+ * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
+ * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
+ * MSRs that KVM emulates without strictly requiring host support.
+ * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
+ * msrs_to_save and emulated_msrs.
*/
static const u32 msrs_to_save_base[] = {
@@ -1483,6 +1478,10 @@ static const u32 msrs_to_save_pmu[] = {
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+ MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
@@ -1531,11 +1530,11 @@ static const u32 emulated_msrs_all[] = {
MSR_IA32_UCODE_REV,
/*
- * The following list leaves out MSRs whose values are determined
- * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
- * We always support the "true" VMX control MSRs, even if the host
- * processor does not, so I am putting these registers here rather
- * than in msrs_to_save_all.
+ * KVM always supports the "true" VMX control MSRs, even if the host
+ * does not. The VMX MSRs as a whole are considered "emulated" as KVM
+ * doesn't strictly require them to exist in the host (ignoring that
+ * KVM would refuse to load in the first place if the core set of MSRs
+ * aren't supported).
*/
MSR_IA32_VMX_BASIC,
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
@@ -1631,7 +1630,7 @@ static u64 kvm_get_arch_capabilities(void)
* If we're doing cache flushes (either "always" or "cond")
* we will do one whenever the guest does a vmlaunch/vmresume.
* If an outer hypervisor is doing the cache flush for us
- * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
+ * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that
* capability to the guest too, and if EPT is disabled we're not
* vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
* require a nested hypervisor to do a flush of its own.
@@ -1809,7 +1808,7 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
unsigned long *bitmap = ranges[i].bitmap;
if ((index >= start) && (index < end) && (flags & type)) {
- allowed = !!test_bit(index - start, bitmap);
+ allowed = test_bit(index - start, bitmap);
break;
}
}
@@ -3701,8 +3700,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
- case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
+ case MSR_IA32_CR_PAT:
+ if (!kvm_pat_valid(data))
+ return 1;
+
+ vcpu->arch.pat = data;
+ break;
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
@@ -4109,9 +4114,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
break;
}
+ case MSR_IA32_CR_PAT:
+ msr_info->data = vcpu->arch.pat;
+ break;
case MSR_MTRRcap:
- case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
- case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
msr_info->data = 3;
@@ -7149,6 +7157,12 @@ static void kvm_probe_msr_to_save(u32 msr_index)
kvm_pmu_cap.num_counters_fixed)
return;
break;
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
+ return;
+ break;
case MSR_IA32_XFD:
case MSR_IA32_XFD_ERR:
if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
@@ -10434,20 +10448,6 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
}
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
-{
- unsigned long apic_address;
-
- /*
- * The physical address of apic access page is stored in the VMCS.
- * Update it when it becomes invalid.
- */
- apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (start <= apic_address && apic_address < end)
- kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-}
-
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c544602d07a3..82e3dafc5453 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -309,7 +309,6 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
-bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig
index 80c4e5101c97..8a03af5ee5b3 100644
--- a/drivers/s390/char/Kconfig
+++ b/drivers/s390/char/Kconfig
@@ -96,7 +96,7 @@ config SCLP_OFB
config S390_UV_UAPI
def_tristate m
prompt "Ultravisor userspace API"
- depends on S390
+ depends on S390 && (KVM || PROTECTED_VIRTUALIZATION_GUEST)
help
Selecting exposes parts of the UV interface to userspace
by providing a misc character device at /dev/uv.
diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c
index 1d40457c7b10..144cd2e03590 100644
--- a/drivers/s390/char/uvdevice.c
+++ b/drivers/s390/char/uvdevice.c
@@ -32,6 +32,55 @@
#include <asm/uvdevice.h>
#include <asm/uv.h>
+#define BIT_UVIO_INTERNAL U32_MAX
+/* Mapping from IOCTL-nr to UVC-bit */
+static const u32 ioctl_nr_to_uvc_bit[] __initconst = {
+ [UVIO_IOCTL_UVDEV_INFO_NR] = BIT_UVIO_INTERNAL,
+ [UVIO_IOCTL_ATT_NR] = BIT_UVC_CMD_RETR_ATTEST,
+ [UVIO_IOCTL_ADD_SECRET_NR] = BIT_UVC_CMD_ADD_SECRET,
+ [UVIO_IOCTL_LIST_SECRETS_NR] = BIT_UVC_CMD_LIST_SECRETS,
+ [UVIO_IOCTL_LOCK_SECRETS_NR] = BIT_UVC_CMD_LOCK_SECRETS,
+};
+
+static_assert(ARRAY_SIZE(ioctl_nr_to_uvc_bit) == UVIO_IOCTL_NUM_IOCTLS);
+
+static struct uvio_uvdev_info uvdev_info = {
+ .supp_uvio_cmds = GENMASK_ULL(UVIO_IOCTL_NUM_IOCTLS - 1, 0),
+};
+
+static void __init set_supp_uv_cmds(unsigned long *supp_uv_cmds)
+{
+ int i;
+
+ for (i = 0; i < UVIO_IOCTL_NUM_IOCTLS; i++) {
+ if (ioctl_nr_to_uvc_bit[i] == BIT_UVIO_INTERNAL)
+ continue;
+ if (!test_bit_inv(ioctl_nr_to_uvc_bit[i], uv_info.inst_calls_list))
+ continue;
+ __set_bit(i, supp_uv_cmds);
+ }
+}
+
+/**
+ * uvio_uvdev_info() - get information about the uvdevice
+ *
+ * @uv_ioctl: ioctl control block
+ *
+ * Lists all IOCTLs that are supported by this uvdevice
+ */
+static int uvio_uvdev_info(struct uvio_ioctl_cb *uv_ioctl)
+{
+ void __user *user_buf_arg = (void __user *)uv_ioctl->argument_addr;
+
+ if (uv_ioctl->argument_len < sizeof(uvdev_info))
+ return -EINVAL;
+ if (copy_to_user(user_buf_arg, &uvdev_info, sizeof(uvdev_info)))
+ return -EFAULT;
+
+ uv_ioctl->uv_rc = UVC_RC_EXECUTED;
+ return 0;
+}
+
static int uvio_build_uvcb_attest(struct uv_cb_attest *uvcb_attest, u8 *arcb,
u8 *meas, u8 *add_data, struct uvio_attest *uvio_attest)
{
@@ -185,8 +234,161 @@ out:
return ret;
}
-static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp)
+/** uvio_add_secret() - perform an Add Secret UVC
+ *
+ * @uv_ioctl: ioctl control block
+ *
+ * uvio_add_secret() performs the Add Secret Ultravisor Call.
+ *
+ * The given userspace argument address and size are verified to be
+ * valid but every other check is made by the Ultravisor
+ * (UV). Therefore UV errors won't result in a negative return
+ * value. The request is then copied to kernelspace, the UV-call is
+ * performed and the results are copied back to userspace.
+ *
+ * The argument has to point to an Add Secret Request Control Block
+ * which is an encrypted and cryptographically verified request that
+ * inserts a protected guest's secrets into the Ultravisor for later
+ * use.
+ *
+ * If the Add Secret UV facility is not present, UV will return
+ * invalid command rc. This won't be fenced in the driver and does not
+ * result in a negative return value.
+ *
+ * Context: might sleep
+ *
+ * Return: 0 on success or a negative error code on error.
+ */
+static int uvio_add_secret(struct uvio_ioctl_cb *uv_ioctl)
{
+ void __user *user_buf_arg = (void __user *)uv_ioctl->argument_addr;
+ struct uv_cb_guest_addr uvcb = {
+ .header.len = sizeof(uvcb),
+ .header.cmd = UVC_CMD_ADD_SECRET,
+ };
+ void *asrcb = NULL;
+ int ret;
+
+ if (uv_ioctl->argument_len > UVIO_ADD_SECRET_MAX_LEN)
+ return -EINVAL;
+ if (uv_ioctl->argument_len == 0)
+ return -EINVAL;
+
+ asrcb = kvzalloc(uv_ioctl->argument_len, GFP_KERNEL);
+ if (!asrcb)
+ return -ENOMEM;
+
+ ret = -EFAULT;
+ if (copy_from_user(asrcb, user_buf_arg, uv_ioctl->argument_len))
+ goto out;
+
+ ret = 0;
+ uvcb.addr = (u64)asrcb;
+ uv_call_sched(0, (u64)&uvcb);
+ uv_ioctl->uv_rc = uvcb.header.rc;
+ uv_ioctl->uv_rrc = uvcb.header.rrc;
+
+out:
+ kvfree(asrcb);
+ return ret;
+}
+
+/** uvio_list_secrets() - perform a List Secret UVC
+ * @uv_ioctl: ioctl control block
+ *
+ * uvio_list_secrets() performs the List Secret Ultravisor Call. It verifies
+ * that the given userspace argument address is valid and its size is sane.
+ * Every other check is made by the Ultravisor (UV) and won't result in a
+ * negative return value. It builds the request, performs the UV-call, and
+ * copies the result to userspace.
+ *
+ * The argument specifies the location for the result of the UV-Call.
+ *
+ * If the List Secrets UV facility is not present, UV will return invalid
+ * command rc. This won't be fenced in the driver and does not result in a
+ * negative return value.
+ *
+ * Context: might sleep
+ *
+ * Return: 0 on success or a negative error code on error.
+ */
+static int uvio_list_secrets(struct uvio_ioctl_cb *uv_ioctl)
+{
+ void __user *user_buf_arg = (void __user *)uv_ioctl->argument_addr;
+ struct uv_cb_guest_addr uvcb = {
+ .header.len = sizeof(uvcb),
+ .header.cmd = UVC_CMD_LIST_SECRETS,
+ };
+ void *secrets = NULL;
+ int ret = 0;
+
+ if (uv_ioctl->argument_len != UVIO_LIST_SECRETS_LEN)
+ return -EINVAL;
+
+ secrets = kvzalloc(UVIO_LIST_SECRETS_LEN, GFP_KERNEL);
+ if (!secrets)
+ return -ENOMEM;
+
+ uvcb.addr = (u64)secrets;
+ uv_call_sched(0, (u64)&uvcb);
+ uv_ioctl->uv_rc = uvcb.header.rc;
+ uv_ioctl->uv_rrc = uvcb.header.rrc;
+
+ if (copy_to_user(user_buf_arg, secrets, UVIO_LIST_SECRETS_LEN))
+ ret = -EFAULT;
+
+ kvfree(secrets);
+ return ret;
+}
+
+/** uvio_lock_secrets() - perform a Lock Secret Store UVC
+ * @uv_ioctl: ioctl control block
+ *
+ * uvio_lock_secrets() performs the Lock Secret Store Ultravisor Call. It
+ * performs the UV-call and copies the return codes to the ioctl control block.
+ * After this call was dispatched successfully every following Add Secret UVC
+ * and Lock Secrets UVC will fail with return code 0x102.
+ *
+ * The argument address and size must be 0.
+ *
+ * If the Lock Secrets UV facility is not present, UV will return invalid
+ * command rc. This won't be fenced in the driver and does not result in a
+ * negative return value.
+ *
+ * Context: might sleep
+ *
+ * Return: 0 on success or a negative error code on error.
+ */
+static int uvio_lock_secrets(struct uvio_ioctl_cb *ioctl)
+{
+ struct uv_cb_nodata uvcb = {
+ .header.len = sizeof(uvcb),
+ .header.cmd = UVC_CMD_LOCK_SECRETS,
+ };
+
+ if (ioctl->argument_addr || ioctl->argument_len)
+ return -EINVAL;
+
+ uv_call(0, (u64)&uvcb);
+ ioctl->uv_rc = uvcb.header.rc;
+ ioctl->uv_rrc = uvcb.header.rrc;
+
+ return 0;
+}
+
+static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp,
+ unsigned long cmd)
+{
+ u8 nr = _IOC_NR(cmd);
+
+ if (_IOC_DIR(cmd) != (_IOC_READ | _IOC_WRITE))
+ return -ENOIOCTLCMD;
+ if (_IOC_TYPE(cmd) != UVIO_TYPE_UVC)
+ return -ENOIOCTLCMD;
+ if (nr >= UVIO_IOCTL_NUM_IOCTLS)
+ return -ENOIOCTLCMD;
+ if (_IOC_SIZE(cmd) != sizeof(*ioctl))
+ return -ENOIOCTLCMD;
if (copy_from_user(ioctl, argp, sizeof(*ioctl)))
return -EFAULT;
if (ioctl->flags != 0)
@@ -194,7 +396,7 @@ static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *a
if (memchr_inv(ioctl->reserved14, 0, sizeof(ioctl->reserved14)))
return -EINVAL;
- return 0;
+ return nr;
}
/*
@@ -205,14 +407,28 @@ static long uvio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
void __user *argp = (void __user *)arg;
struct uvio_ioctl_cb uv_ioctl = { };
long ret;
+ int nr;
- switch (cmd) {
- case UVIO_IOCTL_ATT:
- ret = uvio_copy_and_check_ioctl(&uv_ioctl, argp);
- if (ret)
- return ret;
+ nr = uvio_copy_and_check_ioctl(&uv_ioctl, argp, cmd);
+ if (nr < 0)
+ return nr;
+
+ switch (nr) {
+ case UVIO_IOCTL_UVDEV_INFO_NR:
+ ret = uvio_uvdev_info(&uv_ioctl);
+ break;
+ case UVIO_IOCTL_ATT_NR:
ret = uvio_attestation(&uv_ioctl);
break;
+ case UVIO_IOCTL_ADD_SECRET_NR:
+ ret = uvio_add_secret(&uv_ioctl);
+ break;
+ case UVIO_IOCTL_LIST_SECRETS_NR:
+ ret = uvio_list_secrets(&uv_ioctl);
+ break;
+ case UVIO_IOCTL_LOCK_SECRETS_NR:
+ ret = uvio_lock_secrets(&uv_ioctl);
+ break;
default:
ret = -ENOIOCTLCMD;
break;
@@ -245,6 +461,7 @@ static void __exit uvio_dev_exit(void)
static int __init uvio_dev_init(void)
{
+ set_supp_uv_cmds((unsigned long *)&uvdev_info.supp_uv_cmds);
return misc_register(&uvio_dev_miscdev);
}
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 1a6a695ca67a..847da6fc2713 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -92,8 +92,12 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
/*
* Evaluates as true when emulating PMUv3p5, and false otherwise.
*/
-#define kvm_pmu_is_3p5(vcpu) \
- (vcpu->kvm->arch.dfr0_pmuver.imp >= ID_AA64DFR0_EL1_PMUVer_V3P5)
+#define kvm_pmu_is_3p5(vcpu) ({ \
+ u64 val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); \
+ u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); \
+ \
+ pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5; \
+})
u8 kvm_arm_pmu_get_pmuver_limit(void);
diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
index d75fc4365746..56619e33251e 100644
--- a/include/kvm/iodev.h
+++ b/include/kvm/iodev.h
@@ -55,10 +55,4 @@ static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
: -EOPNOTSUPP;
}
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
- if (dev->ops->destructor)
- dev->ops->destructor(dev);
-}
-
#endif /* __KVM_IODEV_H__ */
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 583fe3b49a49..cc060da51bec 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -94,6 +94,14 @@
*/
#define FFA_PAGE_SIZE SZ_4K
+/*
+ * Minimum buffer size/alignment encodings returned by an FFA_FEATURES
+ * query for FFA_RXTX_MAP.
+ */
+#define FFA_FEAT_RXTX_MIN_SZ_4K 0
+#define FFA_FEAT_RXTX_MIN_SZ_64K 1
+#define FFA_FEAT_RXTX_MIN_SZ_16K 2
+
/* FFA Bus/Device/Driver related */
struct ffa_device {
u32 id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0e571e973bc2..9d3ac7720da9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -849,7 +849,7 @@ static inline void kvm_vm_bugged(struct kvm *kvm)
#define KVM_BUG(cond, kvm, fmt...) \
({ \
- int __ret = (cond); \
+ bool __ret = !!(cond); \
\
if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \
kvm_vm_bugged(kvm); \
@@ -858,7 +858,7 @@ static inline void kvm_vm_bugged(struct kvm *kvm)
#define KVM_BUG_ON(cond, kvm) \
({ \
- int __ret = (cond); \
+ bool __ret = !!(cond); \
\
if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \
kvm_vm_bugged(kvm); \
@@ -991,6 +991,8 @@ static inline bool kvm_memslots_empty(struct kvm_memslots *slots)
return RB_EMPTY_ROOT(&slots->gfn_tree);
}
+bool kvm_are_all_memslots_empty(struct kvm *kvm);
+
#define kvm_for_each_memslot(memslot, bkt, slots) \
hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \
if (WARN_ON_ONCE(!memslot->npages)) { \
@@ -2237,9 +2239,6 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
}
#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
-void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end);
-
void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
#ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 737318b1c1d9..f089ab290978 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1190,6 +1190,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
#define KVM_CAP_COUNTER_OFFSET 227
+#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
+#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1442,6 +1444,8 @@ enum kvm_device_type {
#define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE
KVM_DEV_TYPE_ARM_PV_TIME,
#define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME
+ KVM_DEV_TYPE_RISCV_AIA,
+#define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA
KVM_DEV_TYPE_MAX,
};
@@ -1613,7 +1617,7 @@ struct kvm_s390_ucas_mapping {
#define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs)
#define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs)
/*
- * vcpu version available with KVM_ENABLE_CAP
+ * vcpu version available with KVM_CAP_ENABLE_CAP
* vm version available with KVM_CAP_ENABLE_CAP_VM
*/
#define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap)
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 4761b768b773..c692cc86e7da 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -61,6 +61,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
# Compiled test targets
TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
@@ -164,6 +165,7 @@ TEST_GEN_PROGS_s390x = s390x/memop
TEST_GEN_PROGS_s390x += s390x/resets
TEST_GEN_PROGS_s390x += s390x/sync_regs_test
TEST_GEN_PROGS_s390x += s390x/tprot
+TEST_GEN_PROGS_s390x += s390x/cmma_test
TEST_GEN_PROGS_s390x += demand_paging_test
TEST_GEN_PROGS_s390x += dirty_log_test
TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
@@ -184,6 +186,8 @@ TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR))
TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR))
LIBKVM += $(LIBKVM_$(ARCH_DIR))
+OVERRIDE_TARGETS = 1
+
# lib.mak defines $(OUTPUT), prepends $(OUTPUT)/ to $(TEST_GEN_PROGS), and most
# importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`,
# which causes the environment variable to override the makefile).
@@ -198,7 +202,7 @@ else
LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
endif
CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
- -Wno-gnu-variable-sized-type-not-at-end \
+ -Wno-gnu-variable-sized-type-not-at-end -MD\
-fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \
-fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
-I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
@@ -225,7 +229,18 @@ LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
-EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
+TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS))
+TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED))
+TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ))
+TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS))
+-include $(TEST_DEP_FILES)
+
+$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): %: %.o
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@
+$(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+EXTRA_CLEAN += $(LIBKVM_OBJS) $(TEST_DEP_FILES) $(TEST_GEN_OBJ) cscope.*
x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 2439c4043fed..09c116a82a84 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -128,6 +128,7 @@ static void prefault_mem(void *alias, uint64_t len)
static void run_test(enum vm_guest_mode mode, void *arg)
{
+ struct memstress_vcpu_args *vcpu_args;
struct test_params *p = arg;
struct uffd_desc **uffd_descs = NULL;
struct timespec start;
@@ -145,24 +146,24 @@ static void run_test(enum vm_guest_mode mode, void *arg)
"Failed to allocate buffer for guest data pattern");
memset(guest_data_prototype, 0xAB, demand_paging_size);
+ if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) {
+ for (i = 0; i < nr_vcpus; i++) {
+ vcpu_args = &memstress_args.vcpu_args[i];
+ prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa),
+ vcpu_args->pages * memstress_args.guest_page_size);
+ }
+ }
+
if (p->uffd_mode) {
uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *));
TEST_ASSERT(uffd_descs, "Memory allocation failed");
-
for (i = 0; i < nr_vcpus; i++) {
- struct memstress_vcpu_args *vcpu_args;
void *vcpu_hva;
- void *vcpu_alias;
vcpu_args = &memstress_args.vcpu_args[i];
/* Cache the host addresses of the region */
vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa);
- vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa);
-
- prefault_mem(vcpu_alias,
- vcpu_args->pages * memstress_args.guest_page_size);
-
/*
* Set up user fault fd to handle demand paging
* requests.
@@ -207,10 +208,11 @@ static void help(char *name)
{
puts("");
printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n"
- " [-b memory] [-s type] [-v vcpus] [-o]\n", name);
+ " [-b memory] [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name);
guest_modes_help();
printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n"
" UFFD registration mode: 'MISSING' or 'MINOR'.\n");
+ kvm_print_vcpu_pinning_help();
printf(" -d: add a delay in usec to the User Fault\n"
" FD handler to simulate demand paging\n"
" overheads. Ignored without -u.\n");
@@ -228,6 +230,7 @@ static void help(char *name)
int main(int argc, char *argv[])
{
int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ const char *cpulist = NULL;
struct test_params p = {
.src_type = DEFAULT_VM_MEM_SRC,
.partition_vcpu_memory_access = true,
@@ -236,7 +239,7 @@ int main(int argc, char *argv[])
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:o")) != -1) {
+ while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:c:o")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@@ -263,6 +266,9 @@ int main(int argc, char *argv[])
TEST_ASSERT(nr_vcpus <= max_vcpus,
"Invalid number of vcpus, must be between 1 and %d", max_vcpus);
break;
+ case 'c':
+ cpulist = optarg;
+ break;
case 'o':
p.partition_vcpu_memory_access = false;
break;
@@ -278,6 +284,12 @@ int main(int argc, char *argv[])
TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s");
}
+ if (cpulist) {
+ kvm_parse_vcpu_pinning(cpulist, memstress_args.vcpu_to_pcpu,
+ nr_vcpus);
+ memstress_args.pin_vcpus = true;
+ }
+
for_each_guest_mode(run_test, &p);
return 0;
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
index e9d6d1aecf89..d374dbcf9a53 100644
--- a/tools/testing/selftests/kvm/dirty_log_perf_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -136,77 +136,6 @@ struct test_params {
bool random_access;
};
-static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
-{
- int i;
-
- for (i = 0; i < slots; i++) {
- int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
- int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0;
-
- vm_mem_region_set_flags(vm, slot, flags);
- }
-}
-
-static inline void enable_dirty_logging(struct kvm_vm *vm, int slots)
-{
- toggle_dirty_logging(vm, slots, true);
-}
-
-static inline void disable_dirty_logging(struct kvm_vm *vm, int slots)
-{
- toggle_dirty_logging(vm, slots, false);
-}
-
-static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots)
-{
- int i;
-
- for (i = 0; i < slots; i++) {
- int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
-
- kvm_vm_get_dirty_log(vm, slot, bitmaps[i]);
- }
-}
-
-static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[],
- int slots, uint64_t pages_per_slot)
-{
- int i;
-
- for (i = 0; i < slots; i++) {
- int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
-
- kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot);
- }
-}
-
-static unsigned long **alloc_bitmaps(int slots, uint64_t pages_per_slot)
-{
- unsigned long **bitmaps;
- int i;
-
- bitmaps = malloc(slots * sizeof(bitmaps[0]));
- TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array.");
-
- for (i = 0; i < slots; i++) {
- bitmaps[i] = bitmap_zalloc(pages_per_slot);
- TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap.");
- }
-
- return bitmaps;
-}
-
-static void free_bitmaps(unsigned long *bitmaps[], int slots)
-{
- int i;
-
- for (i = 0; i < slots; i++)
- free(bitmaps[i]);
-
- free(bitmaps);
-}
-
static void run_test(enum vm_guest_mode mode, void *arg)
{
struct test_params *p = arg;
@@ -236,7 +165,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
host_num_pages = vm_num_host_pages(mode, guest_num_pages);
pages_per_slot = host_num_pages / p->slots;
- bitmaps = alloc_bitmaps(p->slots, pages_per_slot);
+ bitmaps = memstress_alloc_bitmaps(p->slots, pages_per_slot);
if (dirty_log_manual_caps)
vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
@@ -277,7 +206,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
/* Enable dirty logging */
clock_gettime(CLOCK_MONOTONIC, &start);
- enable_dirty_logging(vm, p->slots);
+ memstress_enable_dirty_logging(vm, p->slots);
ts_diff = timespec_elapsed(start);
pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -306,7 +235,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
clock_gettime(CLOCK_MONOTONIC, &start);
- get_dirty_log(vm, bitmaps, p->slots);
+ memstress_get_dirty_log(vm, bitmaps, p->slots);
ts_diff = timespec_elapsed(start);
get_dirty_log_total = timespec_add(get_dirty_log_total,
ts_diff);
@@ -315,7 +244,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
if (dirty_log_manual_caps) {
clock_gettime(CLOCK_MONOTONIC, &start);
- clear_dirty_log(vm, bitmaps, p->slots, pages_per_slot);
+ memstress_clear_dirty_log(vm, bitmaps, p->slots,
+ pages_per_slot);
ts_diff = timespec_elapsed(start);
clear_dirty_log_total = timespec_add(clear_dirty_log_total,
ts_diff);
@@ -334,7 +264,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
/* Disable dirty logging */
clock_gettime(CLOCK_MONOTONIC, &start);
- disable_dirty_logging(vm, p->slots);
+ memstress_disable_dirty_logging(vm, p->slots);
ts_diff = timespec_elapsed(start);
pr_info("Disabling dirty logging time: %ld.%.9lds\n",
ts_diff.tv_sec, ts_diff.tv_nsec);
@@ -359,7 +289,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
}
- free_bitmaps(bitmaps, p->slots);
+ memstress_free_bitmaps(bitmaps, p->slots);
arch_cleanup_vm(vm);
memstress_destroy_vm(vm);
}
@@ -402,17 +332,7 @@ static void help(char *name)
" so -w X means each page has an X%% chance of writing\n"
" and a (100-X)%% chance of reading.\n"
" (default: 100 i.e. all pages are written to.)\n");
- printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n"
- " values (target pCPU), one for each vCPU, plus an optional\n"
- " entry for the main application task (specified via entry\n"
- " <nr_vcpus + 1>). If used, entries must be provided for all\n"
- " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
- " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
- " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
- " ./dirty_log_perf_test -v 3 -c 22,23,24,50\n\n"
- " To leave the application task unpinned, drop the final entry:\n\n"
- " ./dirty_log_perf_test -v 3 -c 22,23,24\n\n"
- " (default: no pinning)\n");
+ kvm_print_vcpu_pinning_help();
puts("");
exit(0);
}
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index a089c356f354..07732a157ccd 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -733,6 +733,7 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm);
void kvm_pin_this_task_to_pcpu(uint32_t pcpu);
+void kvm_print_vcpu_pinning_help(void);
void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
int nr_vcpus);
diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h
index 72e3e358ef7b..ce4e603050ea 100644
--- a/tools/testing/selftests/kvm/include/memstress.h
+++ b/tools/testing/selftests/kvm/include/memstress.h
@@ -72,4 +72,12 @@ void memstress_guest_code(uint32_t vcpu_id);
uint64_t memstress_nested_pages(int nr_vcpus);
void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]);
+void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots);
+void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots);
+void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots);
+void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[],
+ int slots, uint64_t pages_per_slot);
+unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot);
+void memstress_free_bitmaps(unsigned long *bitmaps[], int slots);
+
#endif /* SELFTEST_KVM_MEMSTRESS_H */
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 298c4372fb1a..9741a7ff6380 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -494,6 +494,23 @@ static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
return pcpu;
}
+void kvm_print_vcpu_pinning_help(void)
+{
+ const char *name = program_invocation_name;
+
+ printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n"
+ " values (target pCPU), one for each vCPU, plus an optional\n"
+ " entry for the main application task (specified via entry\n"
+ " <nr_vcpus + 1>). If used, entries must be provided for all\n"
+ " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
+ " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
+ " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
+ " %s -v 3 -c 22,23,24,50\n\n"
+ " To leave the application task unpinned, drop the final entry:\n\n"
+ " %s -v 3 -c 22,23,24\n\n"
+ " (default: no pinning)\n", name, name);
+}
+
void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
int nr_vcpus)
{
diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c
index 5f1d3173c238..df457452d146 100644
--- a/tools/testing/selftests/kvm/lib/memstress.c
+++ b/tools/testing/selftests/kvm/lib/memstress.c
@@ -5,6 +5,7 @@
#define _GNU_SOURCE
#include <inttypes.h>
+#include <linux/bitmap.h>
#include "kvm_util.h"
#include "memstress.h"
@@ -64,6 +65,9 @@ void memstress_guest_code(uint32_t vcpu_idx)
GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx);
while (true) {
+ for (i = 0; i < sizeof(memstress_args); i += args->guest_page_size)
+ (void) *((volatile char *)args + i);
+
for (i = 0; i < pages; i++) {
if (args->random_access)
page = guest_random_u32(&rand_state) % pages;
@@ -320,3 +324,74 @@ void memstress_join_vcpu_threads(int nr_vcpus)
for (i = 0; i < nr_vcpus; i++)
pthread_join(vcpu_threads[i].thread, NULL);
}
+
+static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable)
+{
+ int i;
+
+ for (i = 0; i < slots; i++) {
+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
+ int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0;
+
+ vm_mem_region_set_flags(vm, slot, flags);
+ }
+}
+
+void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots)
+{
+ toggle_dirty_logging(vm, slots, true);
+}
+
+void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots)
+{
+ toggle_dirty_logging(vm, slots, false);
+}
+
+void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots)
+{
+ int i;
+
+ for (i = 0; i < slots; i++) {
+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
+
+ kvm_vm_get_dirty_log(vm, slot, bitmaps[i]);
+ }
+}
+
+void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[],
+ int slots, uint64_t pages_per_slot)
+{
+ int i;
+
+ for (i = 0; i < slots; i++) {
+ int slot = MEMSTRESS_MEM_SLOT_INDEX + i;
+
+ kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot);
+ }
+}
+
+unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot)
+{
+ unsigned long **bitmaps;
+ int i;
+
+ bitmaps = malloc(slots * sizeof(bitmaps[0]));
+ TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array.");
+
+ for (i = 0; i < slots; i++) {
+ bitmaps[i] = bitmap_zalloc(pages_per_slot);
+ TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap.");
+ }
+
+ return bitmaps;
+}
+
+void memstress_free_bitmaps(unsigned long *bitmaps[], int slots)
+{
+ int i;
+
+ for (i = 0; i < slots; i++)
+ free(bitmaps[i]);
+
+ free(bitmaps);
+}
diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
index 92cef20902f1..271f63891581 100644
--- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c
+++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c
@@ -70,7 +70,7 @@ static void *uffd_handler_thread_fn(void *arg)
r = read(pollfd[1].fd, &tmp_chr, 1);
TEST_ASSERT(r == 1,
"Error reading pipefd in UFFD thread\n");
- return NULL;
+ break;
}
if (!(pollfd[0].revents & POLLIN))
@@ -103,7 +103,7 @@ static void *uffd_handler_thread_fn(void *arg)
ts_diff = timespec_elapsed(start);
PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
pages, ts_diff.tv_sec, ts_diff.tv_nsec,
- pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0));
+ pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC));
return NULL;
}
diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c
new file mode 100644
index 000000000000..1d73e78e8fa7
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/cmma_test.c
@@ -0,0 +1,700 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test for s390x CMMA migration
+ *
+ * Copyright IBM Corp. 2023
+ *
+ * Authors:
+ * Nico Boehr <nrb@linux.ibm.com>
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kselftest.h"
+
+#define MAIN_PAGE_COUNT 512
+
+#define TEST_DATA_PAGE_COUNT 512
+#define TEST_DATA_MEMSLOT 1
+#define TEST_DATA_START_GFN 4096
+
+#define TEST_DATA_TWO_PAGE_COUNT 256
+#define TEST_DATA_TWO_MEMSLOT 2
+#define TEST_DATA_TWO_START_GFN 8192
+
+static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT];
+
+/**
+ * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot,
+ * so use_cmma goes on and the CMMA related ioctls do something.
+ */
+static void guest_do_one_essa(void)
+{
+ asm volatile(
+ /* load TEST_DATA_START_GFN into r1 */
+ " llilf 1,%[start_gfn]\n"
+ /* calculate the address from the gfn */
+ " sllg 1,1,12(0)\n"
+ /* set the first page in TEST_DATA memslot to STABLE */
+ " .insn rrf,0xb9ab0000,2,1,1,0\n"
+ /* hypercall */
+ " diag 0,0,0x501\n"
+ "0: j 0b"
+ :
+ : [start_gfn] "L"(TEST_DATA_START_GFN)
+ : "r1", "r2", "memory", "cc"
+ );
+}
+
+/**
+ * Touch CMMA attributes of all pages in TEST_DATA memslot. Set them to stable
+ * state.
+ */
+static void guest_dirty_test_data(void)
+{
+ asm volatile(
+ /* r1 = TEST_DATA_START_GFN */
+ " xgr 1,1\n"
+ " llilf 1,%[start_gfn]\n"
+ /* r5 = TEST_DATA_PAGE_COUNT */
+ " lghi 5,%[page_count]\n"
+ /* r5 += r1 */
+ "2: agfr 5,1\n"
+ /* r2 = r1 << 12 */
+ "1: sllg 2,1,12(0)\n"
+ /* essa(r4, r2, SET_STABLE) */
+ " .insn rrf,0xb9ab0000,4,2,1,0\n"
+ /* i++ */
+ " agfi 1,1\n"
+ /* if r1 < r5 goto 1 */
+ " cgrjl 1,5,1b\n"
+ /* hypercall */
+ " diag 0,0,0x501\n"
+ "0: j 0b"
+ :
+ : [start_gfn] "L"(TEST_DATA_START_GFN),
+ [page_count] "L"(TEST_DATA_PAGE_COUNT)
+ :
+ /* the counter in our loop over the pages */
+ "r1",
+ /* the calculated page physical address */
+ "r2",
+ /* ESSA output register */
+ "r4",
+ /* last page */
+ "r5",
+ "cc", "memory"
+ );
+}
+
+static struct kvm_vm *create_vm(void)
+{
+ return ____vm_create(VM_MODE_DEFAULT);
+}
+
+static void create_main_memslot(struct kvm_vm *vm)
+{
+ int i;
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, MAIN_PAGE_COUNT, 0);
+ /* set the array of memslots to zero like __vm_create does */
+ for (i = 0; i < NR_MEM_REGIONS; i++)
+ vm->memslots[i] = 0;
+}
+
+static void create_test_memslot(struct kvm_vm *vm)
+{
+ vm_userspace_mem_region_add(vm,
+ VM_MEM_SRC_ANONYMOUS,
+ TEST_DATA_START_GFN << vm->page_shift,
+ TEST_DATA_MEMSLOT,
+ TEST_DATA_PAGE_COUNT,
+ 0
+ );
+ vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT;
+}
+
+static void create_memslots(struct kvm_vm *vm)
+{
+ /*
+ * Our VM has the following memory layout:
+ * +------+---------------------------+
+ * | GFN | Memslot |
+ * +------+---------------------------+
+ * | 0 | |
+ * | ... | MAIN (Code, Stack, ...) |
+ * | 511 | |
+ * +------+---------------------------+
+ * | 4096 | |
+ * | ... | TEST_DATA |
+ * | 4607 | |
+ * +------+---------------------------+
+ */
+ create_main_memslot(vm);
+ create_test_memslot(vm);
+}
+
+static void finish_vm_setup(struct kvm_vm *vm)
+{
+ struct userspace_mem_region *slot0;
+
+ kvm_vm_elf_load(vm, program_invocation_name);
+
+ slot0 = memslot2region(vm, 0);
+ ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
+
+ kvm_arch_vm_post_create(vm);
+}
+
+static struct kvm_vm *create_vm_two_memslots(void)
+{
+ struct kvm_vm *vm;
+
+ vm = create_vm();
+
+ create_memslots(vm);
+
+ finish_vm_setup(vm);
+
+ return vm;
+}
+
+static void enable_cmma(struct kvm_vm *vm)
+{
+ int r;
+
+ r = __kvm_device_attr_set(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA, NULL);
+ TEST_ASSERT(!r, "enabling cmma failed r=%d errno=%d", r, errno);
+}
+
+static void enable_dirty_tracking(struct kvm_vm *vm)
+{
+ vm_mem_region_set_flags(vm, 0, KVM_MEM_LOG_DIRTY_PAGES);
+ vm_mem_region_set_flags(vm, TEST_DATA_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES);
+}
+
+static int __enable_migration_mode(struct kvm_vm *vm)
+{
+ return __kvm_device_attr_set(vm->fd,
+ KVM_S390_VM_MIGRATION,
+ KVM_S390_VM_MIGRATION_START,
+ NULL
+ );
+}
+
+static void enable_migration_mode(struct kvm_vm *vm)
+{
+ int r = __enable_migration_mode(vm);
+
+ TEST_ASSERT(!r, "enabling migration mode failed r=%d errno=%d", r, errno);
+}
+
+static bool is_migration_mode_on(struct kvm_vm *vm)
+{
+ u64 out;
+ int r;
+
+ r = __kvm_device_attr_get(vm->fd,
+ KVM_S390_VM_MIGRATION,
+ KVM_S390_VM_MIGRATION_STATUS,
+ &out
+ );
+ TEST_ASSERT(!r, "getting migration mode status failed r=%d errno=%d", r, errno);
+ return out;
+}
+
+static int vm_get_cmma_bits(struct kvm_vm *vm, u64 flags, int *errno_out)
+{
+ struct kvm_s390_cmma_log args;
+ int rc;
+
+ errno = 0;
+
+ args = (struct kvm_s390_cmma_log){
+ .start_gfn = 0,
+ .count = sizeof(cmma_value_buf),
+ .flags = flags,
+ .values = (__u64)&cmma_value_buf[0]
+ };
+ rc = __vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+
+ *errno_out = errno;
+ return rc;
+}
+
+static void test_get_cmma_basic(void)
+{
+ struct kvm_vm *vm = create_vm_two_memslots();
+ struct kvm_vcpu *vcpu;
+ int rc, errno_out;
+
+ /* GET_CMMA_BITS without CMMA enabled should fail */
+ rc = vm_get_cmma_bits(vm, 0, &errno_out);
+ ASSERT_EQ(rc, -1);
+ ASSERT_EQ(errno_out, ENXIO);
+
+ enable_cmma(vm);
+ vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa);
+
+ vcpu_run(vcpu);
+
+ /* GET_CMMA_BITS without migration mode and without peeking should fail */
+ rc = vm_get_cmma_bits(vm, 0, &errno_out);
+ ASSERT_EQ(rc, -1);
+ ASSERT_EQ(errno_out, EINVAL);
+
+ /* GET_CMMA_BITS without migration mode and with peeking should work */
+ rc = vm_get_cmma_bits(vm, KVM_S390_CMMA_PEEK, &errno_out);
+ ASSERT_EQ(rc, 0);
+ ASSERT_EQ(errno_out, 0);
+
+ enable_dirty_tracking(vm);
+ enable_migration_mode(vm);
+
+ /* GET_CMMA_BITS with invalid flags */
+ rc = vm_get_cmma_bits(vm, 0xfeedc0fe, &errno_out);
+ ASSERT_EQ(rc, -1);
+ ASSERT_EQ(errno_out, EINVAL);
+
+ kvm_vm_free(vm);
+}
+
+static void assert_exit_was_hypercall(struct kvm_vcpu *vcpu)
+{
+ ASSERT_EQ(vcpu->run->exit_reason, 13);
+ ASSERT_EQ(vcpu->run->s390_sieic.icptcode, 4);
+ ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x8300);
+ ASSERT_EQ(vcpu->run->s390_sieic.ipb, 0x5010000);
+}
+
+static void test_migration_mode(void)
+{
+ struct kvm_vm *vm = create_vm();
+ struct kvm_vcpu *vcpu;
+ u64 orig_psw;
+ int rc;
+
+ /* enabling migration mode on a VM without memory should fail */
+ rc = __enable_migration_mode(vm);
+ ASSERT_EQ(rc, -1);
+ ASSERT_EQ(errno, EINVAL);
+ TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off");
+ errno = 0;
+
+ create_memslots(vm);
+ finish_vm_setup(vm);
+
+ enable_cmma(vm);
+ vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa);
+ orig_psw = vcpu->run->psw_addr;
+
+ /*
+ * Execute one essa instruction in the guest. Otherwise the guest will
+ * not have use_cmm enabled and GET_CMMA_BITS will return no pages.
+ */
+ vcpu_run(vcpu);
+ assert_exit_was_hypercall(vcpu);
+
+ /* migration mode when memslots have dirty tracking off should fail */
+ rc = __enable_migration_mode(vm);
+ ASSERT_EQ(rc, -1);
+ ASSERT_EQ(errno, EINVAL);
+ TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off");
+ errno = 0;
+
+ /* enable dirty tracking */
+ enable_dirty_tracking(vm);
+
+ /* enabling migration mode should work now */
+ rc = __enable_migration_mode(vm);
+ ASSERT_EQ(rc, 0);
+ TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+ errno = 0;
+
+ /* execute another ESSA instruction to see this goes fine */
+ vcpu->run->psw_addr = orig_psw;
+ vcpu_run(vcpu);
+ assert_exit_was_hypercall(vcpu);
+
+ /*
+ * With migration mode on, create a new memslot with dirty tracking off.
+ * This should turn off migration mode.
+ */
+ TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+ vm_userspace_mem_region_add(vm,
+ VM_MEM_SRC_ANONYMOUS,
+ TEST_DATA_TWO_START_GFN << vm->page_shift,
+ TEST_DATA_TWO_MEMSLOT,
+ TEST_DATA_TWO_PAGE_COUNT,
+ 0
+ );
+ TEST_ASSERT(!is_migration_mode_on(vm),
+ "creating memslot without dirty tracking turns off migration mode"
+ );
+
+ /* ESSA instructions should still execute fine */
+ vcpu->run->psw_addr = orig_psw;
+ vcpu_run(vcpu);
+ assert_exit_was_hypercall(vcpu);
+
+ /*
+ * Turn on dirty tracking on the new memslot.
+ * It should be possible to turn migration mode back on again.
+ */
+ vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES);
+ rc = __enable_migration_mode(vm);
+ ASSERT_EQ(rc, 0);
+ TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+ errno = 0;
+
+ /*
+ * Turn off dirty tracking again, this time with just a flag change.
+ * Again, migration mode should turn off.
+ */
+ TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on");
+ vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, 0);
+ TEST_ASSERT(!is_migration_mode_on(vm),
+ "disabling dirty tracking should turn off migration mode"
+ );
+
+ /* ESSA instructions should still execute fine */
+ vcpu->run->psw_addr = orig_psw;
+ vcpu_run(vcpu);
+ assert_exit_was_hypercall(vcpu);
+
+ kvm_vm_free(vm);
+}
+
+/**
+ * Given a VM with the MAIN and TEST_DATA memslot, assert that both slots have
+ * CMMA attributes of all pages in both memslots and nothing more dirty.
+ * This has the useful side effect of ensuring nothing is CMMA dirty after this
+ * function.
+ */
+static void assert_all_slots_cmma_dirty(struct kvm_vm *vm)
+{
+ struct kvm_s390_cmma_log args;
+
+ /*
+ * First iteration - everything should be dirty.
+ * Start at the main memslot...
+ */
+ args = (struct kvm_s390_cmma_log){
+ .start_gfn = 0,
+ .count = sizeof(cmma_value_buf),
+ .flags = 0,
+ .values = (__u64)&cmma_value_buf[0]
+ };
+ memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+ vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+ ASSERT_EQ(args.count, MAIN_PAGE_COUNT);
+ ASSERT_EQ(args.remaining, TEST_DATA_PAGE_COUNT);
+ ASSERT_EQ(args.start_gfn, 0);
+
+ /* ...and then - after a hole - the TEST_DATA memslot should follow */
+ args = (struct kvm_s390_cmma_log){
+ .start_gfn = MAIN_PAGE_COUNT,
+ .count = sizeof(cmma_value_buf),
+ .flags = 0,
+ .values = (__u64)&cmma_value_buf[0]
+ };
+ memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+ vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+ ASSERT_EQ(args.count, TEST_DATA_PAGE_COUNT);
+ ASSERT_EQ(args.start_gfn, TEST_DATA_START_GFN);
+ ASSERT_EQ(args.remaining, 0);
+
+ /* ...and nothing else should be there */
+ args = (struct kvm_s390_cmma_log){
+ .start_gfn = TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT,
+ .count = sizeof(cmma_value_buf),
+ .flags = 0,
+ .values = (__u64)&cmma_value_buf[0]
+ };
+ memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+ vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+ ASSERT_EQ(args.count, 0);
+ ASSERT_EQ(args.start_gfn, 0);
+ ASSERT_EQ(args.remaining, 0);
+}
+
+/**
+ * Given a VM, assert no pages are CMMA dirty.
+ */
+static void assert_no_pages_cmma_dirty(struct kvm_vm *vm)
+{
+ struct kvm_s390_cmma_log args;
+
+ /* If we start from GFN 0 again, nothing should be dirty. */
+ args = (struct kvm_s390_cmma_log){
+ .start_gfn = 0,
+ .count = sizeof(cmma_value_buf),
+ .flags = 0,
+ .values = (__u64)&cmma_value_buf[0]
+ };
+ memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+ vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args);
+ if (args.count || args.remaining || args.start_gfn)
+ TEST_FAIL("pages are still dirty start_gfn=0x%llx count=%u remaining=%llu",
+ args.start_gfn,
+ args.count,
+ args.remaining
+ );
+}
+
+static void test_get_inital_dirty(void)
+{
+ struct kvm_vm *vm = create_vm_two_memslots();
+ struct kvm_vcpu *vcpu;
+
+ enable_cmma(vm);
+ vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa);
+
+ /*
+ * Execute one essa instruction in the guest. Otherwise the guest will
+ * not have use_cmm enabled and GET_CMMA_BITS will return no pages.
+ */
+ vcpu_run(vcpu);
+ assert_exit_was_hypercall(vcpu);
+
+ enable_dirty_tracking(vm);
+ enable_migration_mode(vm);
+
+ assert_all_slots_cmma_dirty(vm);
+
+ /* Start from the beginning again and make sure nothing else is dirty */
+ assert_no_pages_cmma_dirty(vm);
+
+ kvm_vm_free(vm);
+}
+
+static void query_cmma_range(struct kvm_vm *vm,
+ u64 start_gfn, u64 gfn_count,
+ struct kvm_s390_cmma_log *res_out)
+{
+ *res_out = (struct kvm_s390_cmma_log){
+ .start_gfn = start_gfn,
+ .count = gfn_count,
+ .flags = 0,
+ .values = (__u64)&cmma_value_buf[0]
+ };
+ memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf));
+ vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, res_out);
+}
+
+/**
+ * Assert the given cmma_log struct that was executed by query_cmma_range()
+ * indicates the first dirty gfn is at first_dirty_gfn and contains exactly
+ * dirty_gfn_count CMMA values.
+ */
+static void assert_cmma_dirty(u64 first_dirty_gfn,
+ u64 dirty_gfn_count,
+ const struct kvm_s390_cmma_log *res)
+{
+ ASSERT_EQ(res->start_gfn, first_dirty_gfn);
+ ASSERT_EQ(res->count, dirty_gfn_count);
+ for (size_t i = 0; i < dirty_gfn_count; i++)
+ ASSERT_EQ(cmma_value_buf[0], 0x0); /* stable state */
+ ASSERT_EQ(cmma_value_buf[dirty_gfn_count], 0xff); /* not touched */
+}
+
+static void test_get_skip_holes(void)
+{
+ size_t gfn_offset;
+ struct kvm_vm *vm = create_vm_two_memslots();
+ struct kvm_s390_cmma_log log;
+ struct kvm_vcpu *vcpu;
+ u64 orig_psw;
+
+ enable_cmma(vm);
+ vcpu = vm_vcpu_add(vm, 1, guest_dirty_test_data);
+
+ orig_psw = vcpu->run->psw_addr;
+
+ /*
+ * Execute some essa instructions in the guest. Otherwise the guest will
+ * not have use_cmm enabled and GET_CMMA_BITS will return no pages.
+ */
+ vcpu_run(vcpu);
+ assert_exit_was_hypercall(vcpu);
+
+ enable_dirty_tracking(vm);
+ enable_migration_mode(vm);
+
+ /* un-dirty all pages */
+ assert_all_slots_cmma_dirty(vm);
+
+ /* Then, dirty just the TEST_DATA memslot */
+ vcpu->run->psw_addr = orig_psw;
+ vcpu_run(vcpu);
+
+ gfn_offset = TEST_DATA_START_GFN;
+ /**
+ * Query CMMA attributes of one page, starting at page 0. Since the
+ * main memslot was not touched by the VM, this should yield the first
+ * page of the TEST_DATA memslot.
+ * The dirty bitmap should now look like this:
+ * 0: not dirty
+ * [0x1, 0x200): dirty
+ */
+ query_cmma_range(vm, 0, 1, &log);
+ assert_cmma_dirty(gfn_offset, 1, &log);
+ gfn_offset++;
+
+ /**
+ * Query CMMA attributes of 32 (0x20) pages past the end of the TEST_DATA
+ * memslot. This should wrap back to the beginning of the TEST_DATA
+ * memslot, page 1.
+ * The dirty bitmap should now look like this:
+ * [0, 0x21): not dirty
+ * [0x21, 0x200): dirty
+ */
+ query_cmma_range(vm, TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, 0x20, &log);
+ assert_cmma_dirty(gfn_offset, 0x20, &log);
+ gfn_offset += 0x20;
+
+ /* Skip 32 pages */
+ gfn_offset += 0x20;
+
+ /**
+ * After skipping 32 pages, query the next 32 (0x20) pages.
+ * The dirty bitmap should now look like this:
+ * [0, 0x21): not dirty
+ * [0x21, 0x41): dirty
+ * [0x41, 0x61): not dirty
+ * [0x61, 0x200): dirty
+ */
+ query_cmma_range(vm, gfn_offset, 0x20, &log);
+ assert_cmma_dirty(gfn_offset, 0x20, &log);
+ gfn_offset += 0x20;
+
+ /**
+ * Query 1 page from the beginning of the TEST_DATA memslot. This should
+ * yield page 0x21.
+ * The dirty bitmap should now look like this:
+ * [0, 0x22): not dirty
+ * [0x22, 0x41): dirty
+ * [0x41, 0x61): not dirty
+ * [0x61, 0x200): dirty
+ */
+ query_cmma_range(vm, TEST_DATA_START_GFN, 1, &log);
+ assert_cmma_dirty(TEST_DATA_START_GFN + 0x21, 1, &log);
+ gfn_offset++;
+
+ /**
+ * Query 15 (0xF) pages from page 0x23 in TEST_DATA memslot.
+ * This should yield pages [0x23, 0x33).
+ * The dirty bitmap should now look like this:
+ * [0, 0x22): not dirty
+ * 0x22: dirty
+ * [0x23, 0x33): not dirty
+ * [0x33, 0x41): dirty
+ * [0x41, 0x61): not dirty
+ * [0x61, 0x200): dirty
+ */
+ gfn_offset = TEST_DATA_START_GFN + 0x23;
+ query_cmma_range(vm, gfn_offset, 15, &log);
+ assert_cmma_dirty(gfn_offset, 15, &log);
+
+ /**
+ * Query 17 (0x11) pages from page 0x22 in TEST_DATA memslot.
+ * This should yield page [0x22, 0x33)
+ * The dirty bitmap should now look like this:
+ * [0, 0x33): not dirty
+ * [0x33, 0x41): dirty
+ * [0x41, 0x61): not dirty
+ * [0x61, 0x200): dirty
+ */
+ gfn_offset = TEST_DATA_START_GFN + 0x22;
+ query_cmma_range(vm, gfn_offset, 17, &log);
+ assert_cmma_dirty(gfn_offset, 17, &log);
+
+ /**
+ * Query 25 (0x19) pages from page 0x40 in TEST_DATA memslot.
+ * This should yield page 0x40 and nothing more, since there are more
+ * than 16 non-dirty pages after page 0x40.
+ * The dirty bitmap should now look like this:
+ * [0, 0x33): not dirty
+ * [0x33, 0x40): dirty
+ * [0x40, 0x61): not dirty
+ * [0x61, 0x200): dirty
+ */
+ gfn_offset = TEST_DATA_START_GFN + 0x40;
+ query_cmma_range(vm, gfn_offset, 25, &log);
+ assert_cmma_dirty(gfn_offset, 1, &log);
+
+ /**
+ * Query pages [0x33, 0x40).
+ * The dirty bitmap should now look like this:
+ * [0, 0x61): not dirty
+ * [0x61, 0x200): dirty
+ */
+ gfn_offset = TEST_DATA_START_GFN + 0x33;
+ query_cmma_range(vm, gfn_offset, 0x40 - 0x33, &log);
+ assert_cmma_dirty(gfn_offset, 0x40 - 0x33, &log);
+
+ /**
+ * Query the remaining pages [0x61, 0x200).
+ */
+ gfn_offset = TEST_DATA_START_GFN;
+ query_cmma_range(vm, gfn_offset, TEST_DATA_PAGE_COUNT - 0x61, &log);
+ assert_cmma_dirty(TEST_DATA_START_GFN + 0x61, TEST_DATA_PAGE_COUNT - 0x61, &log);
+
+ assert_no_pages_cmma_dirty(vm);
+}
+
+struct testdef {
+ const char *name;
+ void (*test)(void);
+} testlist[] = {
+ { "migration mode and dirty tracking", test_migration_mode },
+ { "GET_CMMA_BITS: basic calls", test_get_cmma_basic },
+ { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty },
+ { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes },
+};
+
+/**
+ * The kernel may support CMMA, but the machine may not (i.e. if running as
+ * guest-3).
+ *
+ * In this case, the CMMA capabilities are all there, but the CMMA-related
+ * ioctls fail. To find out whether the machine supports CMMA, create a
+ * temporary VM and then query the CMMA feature of the VM.
+ */
+static int machine_has_cmma(void)
+{
+ struct kvm_vm *vm = create_vm();
+ int r;
+
+ r = !__kvm_has_device_attr(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA);
+ kvm_vm_free(vm);
+
+ return r;
+}
+
+int main(int argc, char *argv[])
+{
+ int idx;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_CMMA_MIGRATION));
+ TEST_REQUIRE(machine_has_cmma());
+
+ ksft_print_header();
+
+ ksft_set_plan(ARRAY_SIZE(testlist));
+
+ for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) {
+ testlist[idx].test();
+ ksft_test_result_pass("%s\n", testlist[idx].name);
+ }
+
+ ksft_finished(); /* Print results and exit() accordingly */
+}
diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
index 2fc3ad9c887e..d3c3aa93f090 100644
--- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -163,6 +163,25 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu)
ent->eax = eax;
}
+static void test_get_cpuid2(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1);
+ int i, r;
+
+ vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid);
+ TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent,
+ "KVM didn't update nent on success, wanted %u, got %u\n",
+ vcpu->cpuid->nent, cpuid->nent);
+
+ for (i = 0; i < vcpu->cpuid->nent; i++) {
+ cpuid->nent = i;
+ r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid);
+ TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r));
+ TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure");
+ }
+ free(cpuid);
+}
+
int main(void)
{
struct kvm_vcpu *vcpu;
@@ -183,5 +202,7 @@ int main(void)
set_cpuid_after_run(vcpu);
+ test_get_cpuid2(vcpu);
+
kvm_vm_free(vm);
}
diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c
new file mode 100644
index 000000000000..beb7e2c10211
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty logging page splitting test
+ *
+ * Based on dirty_log_perf.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2023, Google, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+#include "memstress.h"
+#include "guest_modes.h"
+
+#define VCPUS 2
+#define SLOTS 2
+#define ITERATIONS 2
+
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+static enum vm_mem_backing_src_type backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB;
+
+static u64 dirty_log_manual_caps;
+static bool host_quit;
+static int iteration;
+static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+
+struct kvm_page_stats {
+ uint64_t pages_4k;
+ uint64_t pages_2m;
+ uint64_t pages_1g;
+ uint64_t hugepages;
+};
+
+static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage)
+{
+ stats->pages_4k = vm_get_stat(vm, "pages_4k");
+ stats->pages_2m = vm_get_stat(vm, "pages_2m");
+ stats->pages_1g = vm_get_stat(vm, "pages_1g");
+ stats->hugepages = stats->pages_2m + stats->pages_1g;
+
+ pr_debug("\nPage stats after %s: 4K: %ld 2M: %ld 1G: %ld huge: %ld\n",
+ stage, stats->pages_4k, stats->pages_2m, stats->pages_1g,
+ stats->hugepages);
+}
+
+static void run_vcpu_iteration(struct kvm_vm *vm)
+{
+ int i;
+
+ iteration++;
+ for (i = 0; i < VCPUS; i++) {
+ while (READ_ONCE(vcpu_last_completed_iteration[i]) !=
+ iteration)
+ ;
+ }
+}
+
+static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
+{
+ struct kvm_vcpu *vcpu = vcpu_args->vcpu;
+ int vcpu_idx = vcpu_args->vcpu_idx;
+
+ while (!READ_ONCE(host_quit)) {
+ int current_iteration = READ_ONCE(iteration);
+
+ vcpu_run(vcpu);
+
+ ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
+
+ vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
+
+ /* Wait for the start of the next iteration to be signaled. */
+ while (current_iteration == READ_ONCE(iteration) &&
+ READ_ONCE(iteration) >= 0 &&
+ !READ_ONCE(host_quit))
+ ;
+ }
+}
+
+static void run_test(enum vm_guest_mode mode, void *unused)
+{
+ struct kvm_vm *vm;
+ unsigned long **bitmaps;
+ uint64_t guest_num_pages;
+ uint64_t host_num_pages;
+ uint64_t pages_per_slot;
+ int i;
+ uint64_t total_4k_pages;
+ struct kvm_page_stats stats_populated;
+ struct kvm_page_stats stats_dirty_logging_enabled;
+ struct kvm_page_stats stats_dirty_pass[ITERATIONS];
+ struct kvm_page_stats stats_clear_pass[ITERATIONS];
+ struct kvm_page_stats stats_dirty_logging_disabled;
+ struct kvm_page_stats stats_repopulated;
+
+ vm = memstress_create_vm(mode, VCPUS, guest_percpu_mem_size,
+ SLOTS, backing_src, false);
+
+ guest_num_pages = (VCPUS * guest_percpu_mem_size) >> vm->page_shift;
+ guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+ host_num_pages = vm_num_host_pages(mode, guest_num_pages);
+ pages_per_slot = host_num_pages / SLOTS;
+
+ bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot);
+
+ if (dirty_log_manual_caps)
+ vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
+ dirty_log_manual_caps);
+
+ /* Start the iterations */
+ iteration = -1;
+ host_quit = false;
+
+ for (i = 0; i < VCPUS; i++)
+ vcpu_last_completed_iteration[i] = -1;
+
+ memstress_start_vcpu_threads(VCPUS, vcpu_worker);
+
+ run_vcpu_iteration(vm);
+ get_page_stats(vm, &stats_populated, "populating memory");
+
+ /* Enable dirty logging */
+ memstress_enable_dirty_logging(vm, SLOTS);
+
+ get_page_stats(vm, &stats_dirty_logging_enabled, "enabling dirty logging");
+
+ while (iteration < ITERATIONS) {
+ run_vcpu_iteration(vm);
+ get_page_stats(vm, &stats_dirty_pass[iteration - 1],
+ "dirtying memory");
+
+ memstress_get_dirty_log(vm, bitmaps, SLOTS);
+
+ if (dirty_log_manual_caps) {
+ memstress_clear_dirty_log(vm, bitmaps, SLOTS, pages_per_slot);
+
+ get_page_stats(vm, &stats_clear_pass[iteration - 1], "clearing dirty log");
+ }
+ }
+
+ /* Disable dirty logging */
+ memstress_disable_dirty_logging(vm, SLOTS);
+
+ get_page_stats(vm, &stats_dirty_logging_disabled, "disabling dirty logging");
+
+ /* Run vCPUs again to fault pages back in. */
+ run_vcpu_iteration(vm);
+ get_page_stats(vm, &stats_repopulated, "repopulating memory");
+
+ /*
+ * Tell the vCPU threads to quit. No need to manually check that vCPUs
+ * have stopped running after disabling dirty logging, the join will
+ * wait for them to exit.
+ */
+ host_quit = true;
+ memstress_join_vcpu_threads(VCPUS);
+
+ memstress_free_bitmaps(bitmaps, SLOTS);
+ memstress_destroy_vm(vm);
+
+ /* Make assertions about the page counts. */
+ total_4k_pages = stats_populated.pages_4k;
+ total_4k_pages += stats_populated.pages_2m * 512;
+ total_4k_pages += stats_populated.pages_1g * 512 * 512;
+
+ /*
+ * Check that all huge pages were split. Since large pages can only
+ * exist in the data slot, and the vCPUs should have dirtied all pages
+ * in the data slot, there should be no huge pages left after splitting.
+ * Splitting happens at dirty log enable time without
+ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
+ * with that capability.
+ */
+ if (dirty_log_manual_caps) {
+ ASSERT_EQ(stats_clear_pass[0].hugepages, 0);
+ ASSERT_EQ(stats_clear_pass[0].pages_4k, total_4k_pages);
+ ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages);
+ } else {
+ ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0);
+ ASSERT_EQ(stats_dirty_logging_enabled.pages_4k, total_4k_pages);
+ }
+
+ /*
+ * Once dirty logging is disabled and the vCPUs have touched all their
+ * memory again, the page counts should be the same as they were
+ * right after initial population of memory.
+ */
+ ASSERT_EQ(stats_populated.pages_4k, stats_repopulated.pages_4k);
+ ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m);
+ ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-b vcpu bytes] [-s mem type]\n",
+ name);
+ puts("");
+ printf(" -b: specify the size of the memory region which should be\n"
+ " dirtied by each vCPU. e.g. 10M or 3G.\n"
+ " (default: 1G)\n");
+ backing_src_help("-s");
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ int opt;
+
+ TEST_REQUIRE(get_kvm_param_bool("eager_page_split"));
+ TEST_REQUIRE(get_kvm_param_bool("tdp_mmu"));
+
+ while ((opt = getopt(argc, argv, "b:hs:")) != -1) {
+ switch (opt) {
+ case 'b':
+ guest_percpu_mem_size = parse_size(optarg);
+ break;
+ case 'h':
+ help(argv[0]);
+ exit(0);
+ case 's':
+ backing_src = parse_backing_src_type(optarg);
+ break;
+ default:
+ help(argv[0]);
+ exit(1);
+ }
+ }
+
+ if (!is_backing_src_hugetlb(backing_src)) {
+ pr_info("This test will only work reliably with HugeTLB memory. "
+ "It can work with THP, but that is best effort.\n");
+ }
+
+ guest_modes_append_default();
+
+ dirty_log_manual_caps = 0;
+ for_each_guest_mode(run_test, NULL);
+
+ dirty_log_manual_caps =
+ kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+
+ if (dirty_log_manual_caps) {
+ dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+ KVM_DIRTY_LOG_INITIALLY_SET);
+ for_each_guest_mode(run_test, NULL);
+ } else {
+ pr_info("Skipping testing with MANUAL_PROTECT as it is not supported");
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
index 251794f83719..7f36c32fa760 100644
--- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
+++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
@@ -226,7 +226,7 @@ static void help(char *name)
puts("");
printf("usage: %s [-h] [-p period_ms] [-t token]\n", name);
puts("");
- printf(" -p: The NX reclaim period in miliseconds.\n");
+ printf(" -p: The NX reclaim period in milliseconds.\n");
printf(" -t: The magic token to indicate environment setup is done.\n");
printf(" -r: The test has reboot permissions and can disable NX huge pages.\n");
puts("");
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
index fa03c8d1ce4e..e710b6e7fb38 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
@@ -116,29 +116,21 @@ static void l1_guest_code(struct vmx_pages *vmx_pages)
GUEST_DONE();
}
-static void stable_tsc_check_supported(void)
+static bool system_has_stable_tsc(void)
{
+ bool tsc_is_stable;
FILE *fp;
char buf[4];
fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
if (fp == NULL)
- goto skip_test;
+ return false;
- if (fgets(buf, sizeof(buf), fp) == NULL)
- goto close_fp;
+ tsc_is_stable = fgets(buf, sizeof(buf), fp) &&
+ !strncmp(buf, "tsc", sizeof(buf));
- if (strncmp(buf, "tsc", sizeof(buf)))
- goto close_fp;
-
- fclose(fp);
- return;
-
-close_fp:
fclose(fp);
-skip_test:
- print_skip("Kernel does not use TSC clocksource - assuming that host TSC is not stable");
- exit(KSFT_SKIP);
+ return tsc_is_stable;
}
int main(int argc, char *argv[])
@@ -156,7 +148,7 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL));
- stable_tsc_check_supported();
+ TEST_REQUIRE(system_has_stable_tsc());
/*
* We set L1's scale factor to be a random number from 2 to 10.
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 5ef88f5a0864..1b90acb6e3fe 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -186,15 +186,10 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
r = kvm_io_bus_unregister_dev(kvm,
zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
-
- kvm_iodevice_destructor(&dev->dev);
-
/*
* On failure, unregister destroys all devices on the
- * bus _except_ the target device, i.e. coalesced_zones
- * has been modified. Bail after destroying the target
- * device, there's no need to restart the walk as there
- * aren't any zones left.
+ * bus, including the target device. There's no need
+ * to restart the walk as there aren't any zones left.
*/
if (r)
break;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b0af834ffa95..89912a17f5d5 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -889,9 +889,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
unlock_fail:
mutex_unlock(&kvm->slots_lock);
+ kfree(p);
fail:
- kfree(p);
eventfd_ctx_put(eventfd);
return ret;
@@ -901,7 +901,7 @@ static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_ioeventfd *args)
{
- struct _ioeventfd *p, *tmp;
+ struct _ioeventfd *p;
struct eventfd_ctx *eventfd;
struct kvm_io_bus *bus;
int ret = -ENOENT;
@@ -915,8 +915,7 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
mutex_lock(&kvm->slots_lock);
- list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
-
+ list_for_each_entry(p, &kvm->ioeventfds, list) {
if (p->bus_idx != bus_idx ||
p->eventfd != eventfd ||
p->addr != args->addr ||
@@ -931,7 +930,6 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
bus = kvm_get_bus(kvm, bus_idx);
if (bus)
bus->ioeventfd_count--;
- ioeventfd_release(p);
ret = 0;
break;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 19f301ef23c9..dfbaafbe3a00 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -154,11 +154,6 @@ static unsigned long long kvm_active_vms;
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
-__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
-{
-}
-
__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}
@@ -521,18 +516,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
return container_of(mn, struct kvm, mmu_notifier);
}
-static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct kvm *kvm = mmu_notifier_to_kvm(mn);
- int idx;
-
- idx = srcu_read_lock(&kvm->srcu);
- kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
- srcu_read_unlock(&kvm->srcu, idx);
-}
-
typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
@@ -910,7 +893,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
- .invalidate_range = kvm_mmu_notifier_invalidate_range,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
@@ -3891,7 +3873,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
static int vcpu_get_pid(void *data, u64 *val)
{
struct kvm_vcpu *vcpu = data;
- *val = pid_nr(rcu_access_pointer(vcpu->pid));
+
+ rcu_read_lock();
+ *val = pid_nr(rcu_dereference(vcpu->pid));
+ rcu_read_unlock();
return 0;
}
@@ -3993,7 +3978,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
if (r < 0)
goto kvm_put_xa_release;
- if (KVM_BUG_ON(!!xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
+ if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) {
r = -EINVAL;
goto kvm_put_xa_release;
}
@@ -4623,7 +4608,7 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return -EINVAL;
}
-static bool kvm_are_all_memslots_empty(struct kvm *kvm)
+bool kvm_are_all_memslots_empty(struct kvm *kvm)
{
int i;
@@ -4636,6 +4621,7 @@ static bool kvm_are_all_memslots_empty(struct kvm *kvm)
return true;
}
+EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
struct kvm_enable_cap *cap)
@@ -5315,6 +5301,12 @@ static void hardware_disable_all(void)
}
#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+static void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+ if (dev->ops->destructor)
+ dev->ops->destructor(dev);
+}
+
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
int i;
@@ -5538,7 +5530,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev)
{
- int i, j;
+ int i;
struct kvm_io_bus *new_bus, *bus;
lockdep_assert_held(&kvm->slots_lock);
@@ -5568,18 +5560,19 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
- /* Destroy the old bus _after_ installing the (null) bus. */
+ /*
+ * If NULL bus is installed, destroy the old bus, including all the
+ * attached devices. Otherwise, destroy the caller's device only.
+ */
if (!new_bus) {
pr_err("kvm: failed to shrink bus, removing it completely\n");
- for (j = 0; j < bus->dev_count; j++) {
- if (j == i)
- continue;
- kvm_iodevice_destructor(bus->range[j].dev);
- }
+ kvm_io_bus_destroy(bus);
+ return -ENOMEM;
}
+ kvm_iodevice_destructor(dev);
kfree(bus);
- return new_bus ? 0 : -ENOMEM;
+ return 0;
}
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,