From a33ba1bf0dc6082227d1ddf964632e07b53b8971 Mon Sep 17 00:00:00 2001 From: Wenyao Hai Date: Thu, 11 May 2023 16:33:44 -0700 Subject: KVM: VMX: Open code writing vCPU's PAT in VMX's MSR handler Open code setting "vcpu->arch.pat" in vmx_set_msr() instead of bouncing through kvm_set_msr_common() to get to the same code in kvm_mtrr_set_msr(). This aligns VMX with SVM, avoids hiding a very simple operation behind a relatively complicated function call (finding the PAT MSR case in kvm_set_msr_common() is non-trivial), and most importantly, makes it clear that not unwinding the VMCS updates if kvm_set_msr_common() isn't a bug (because kvm_set_msr_common() can never fail for PAT). Opportunistically set vcpu->arch.pat before updating the VMCS info so that a future patch can move the common bits (back) into kvm_set_msr_common() without a functional change. Note, MSR_IA32_CR_PAT is 0x277, and is very subtly handled by case 0x200 ... MSR_IA32_MC0_CTL2 - 1: in kvm_set_msr_common(). Cc: Kai Huang <kai.huang@intel.com> Signed-off-by: Wenyao Hai <haiwenyao@uniontech.com> [sean: massage changelog, hoist setting vcpu->arch.pat up] Reviewed-by: Kai Huang <kai.huang@intel.com> Link: https://lore.kernel.org/r/20230511233351.635053-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/vmx/vmx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 44fb619803b8..33b8625d3541 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2290,16 +2290,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!kvm_pat_valid(data)) return 1; + vcpu->arch.pat = data; + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) get_vmcs12(vcpu)->guest_ia32_pat = data; - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, data); - vcpu->arch.pat = data; - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && -- cgit v1.2.3 From 7aeae027611ff27f13a32e19736c8fd06e41786c Mon Sep 17 00:00:00 2001 From: Ke Guo Date: Thu, 11 May 2023 16:33:45 -0700 Subject: KVM: SVM: Use kvm_pat_valid() directly instead of kvm_mtrr_valid() Use kvm_pat_valid() directly instead of bouncing through kvm_mtrr_valid(). The PAT is not an MTRR, and kvm_mtrr_valid() just redirects to kvm_pat_valid(), i.e. is exempt from KVM's "zap SPTEs" logic that's needed to honor guest MTRRs when the VM has a passthrough device with non-coherent DMA (KVM does NOT set "ignore guest PAT" in this case, and so enables hardware virtualization of the guest's PAT, i.e. doesn't need to manually emulate the PAT memtype). Signed-off-by: Ke Guo <guoke@uniontech.com> [sean: massage changelog] Reviewed-by: Kai Huang <kai.huang@intel.com> Link: https://lore.kernel.org/r/20230511233351.635053-3-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/svm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ca32389f3c36..e1e17c7dc7d5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2939,7 +2939,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + if (!kvm_pat_valid(data)) return 1; vcpu->arch.pat = data; svm->vmcb01.ptr->save.g_pat = data; -- cgit v1.2.3 From ebda79e5057778be1ad8ed072e4229894dfc66b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:46 -0700 Subject: KVM: x86: Add helper to query if variable MTRR MSR is base (versus mask) Add a helper to query whether a variable MTRR MSR is a base versus as mask MSR. Replace the unnecessarily complex math with a simple check on bit 0; base MSRs are even, mask MSRs are odd. Link: https://lore.kernel.org/r/20230511233351.635053-4-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/mtrr.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 9fac1ec03463..f65ce4b3980f 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -25,6 +25,12 @@ #define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) #define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) +static bool is_mtrr_base_msr(unsigned int msr) +{ + /* MTRR base MSRs use even numbers, masks use odd numbers. */ + return !(msr & 0x1); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -342,10 +348,9 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; struct kvm_mtrr_range *tmp, *cur; - int index, is_mtrr_mask; + int index; index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; cur = &mtrr_state->var_ranges[index]; /* remove the entry if it's in the list. */ @@ -356,7 +361,7 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) * Set all illegal GPA bits in the mask, since those bits must * implicitly be 0. The bits are then cleared when reading them. */ - if (!is_mtrr_mask) + if (is_mtrr_base_msr(msr)) cur->base = data; else cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -418,11 +423,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else if (msr == MSR_IA32_CR_PAT) *pdata = vcpu->arch.pat; else { /* Variable MTRRs */ - int is_mtrr_mask; - index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; - if (!is_mtrr_mask) + if (is_mtrr_base_msr(msr)) *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; -- cgit v1.2.3 From 9ae38b4fb13597ce1821376d23958bbe4976c759 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:47 -0700 Subject: KVM: x86: Add helper to get variable MTRR range from MSR index Add a helper to dedup the logic for retrieving a variable MTRR range structure given a variable MTRR MSR index. No functional change intended. Reviewed-by: Kai Huang <kai.huang@intel.com> Link: https://lore.kernel.org/r/20230511233351.635053-5-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/mtrr.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index f65ce4b3980f..59851dbebfea 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -31,6 +31,14 @@ static bool is_mtrr_base_msr(unsigned int msr) return !(msr & 0x1); } +static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, + unsigned int msr) +{ + int index = (msr - 0x200) / 2; + + return &vcpu->arch.mtrr_state.var_ranges[index]; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -314,7 +322,6 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - int index; if (msr == MSR_IA32_CR_PAT || !tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) @@ -332,8 +339,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) end = ~0ULL; } else { /* variable range MTRRs. */ - index = (msr - 0x200) / 2; - var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end); + var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end); } kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); @@ -348,14 +354,12 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; struct kvm_mtrr_range *tmp, *cur; - int index; - index = (msr - 0x200) / 2; - cur = &mtrr_state->var_ranges[index]; + cur = var_mtrr_msr_to_range(vcpu, msr); /* remove the entry if it's in the list. */ if (var_mtrr_range_is_valid(cur)) - list_del(&mtrr_state->var_ranges[index].node); + list_del(&cur->node); /* * Set all illegal GPA bits in the mask, since those bits must @@ -423,11 +427,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else if (msr == MSR_IA32_CR_PAT) *pdata = vcpu->arch.pat; else { /* Variable MTRRs */ - index = (msr - 0x200) / 2; if (is_mtrr_base_msr(msr)) - *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; + *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; else - *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask; *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } -- cgit v1.2.3 From 34a83deac31cd9fdecef331578422095af2db4b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:48 -0700 Subject: KVM: x86: Use MTRR macros to define possible MTRR MSR ranges Use the MTRR macros to identify the ranges of possible MTRR MSRs instead of bounding the ranges with a mismash of open coded values and unrelated MSR indices. Carving out the gap for the machine check MSRs in particular is confusing, as it's easy to incorrectly think the case statement handles MCE MSRs instead of skipping them. Drop the range-based funneling of MSRs between the end of the MCE MSRs and MTRR_DEF_TYPE, i.e. 0x2A0-0x2FF, and instead handle MTTR_DEF_TYPE as the one-off case that it is. Extract PAT (0x277) as well in anticipation of dropping PAT "handling" from the MTRR code. Keep the range-based handling for the variable+fixed MTRRs even though capturing unknown MSRs 0x214-0x24F is arguably "wrong". There is a gap in the fixed MTRRs, 0x260-0x267, i.e. the MTRR code needs to filter out unknown MSRs anyways, and using a single range generates marginally better code for the big switch statement. Reviewed-by: Kai Huang <kai.huang@intel.com> Link: https://lore.kernel.org/r/20230511233351.635053-6-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/mtrr.c | 7 ++++--- arch/x86/kvm/x86.c | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 59851dbebfea..dc213b940141 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -34,7 +34,7 @@ static bool is_mtrr_base_msr(unsigned int msr) static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, unsigned int msr) { - int index = (msr - 0x200) / 2; + int index = (msr - MTRRphysBase_MSR(0)) / 2; return &vcpu->arch.mtrr_state.var_ranges[index]; } @@ -42,7 +42,7 @@ static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, static bool msr_mtrr_valid(unsigned msr) { switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): case MSR_MTRRfix64K_00000: case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: @@ -88,7 +88,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + WARN_ON(!(msr >= MTRRphysBase_MSR(0) && + msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))); mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0778ca39650..8017e0eb1e67 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3702,8 +3702,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MSR_IA32_CR_PAT: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -4110,9 +4111,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } + case MSR_IA32_CR_PAT: case MSR_MTRRcap: - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; -- cgit v1.2.3 From bc7fe2f0b7511ef6bb2765b6e1f923da449e00ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:49 -0700 Subject: KVM: x86: Move PAT MSR handling out of mtrr.c Drop handling of MSR_IA32_CR_PAT from mtrr.c now that SVM and VMX handle writes without bouncing through kvm_set_msr_common(). PAT isn't truly an MTRR even though it affects memory types, and more importantly KVM enables hardware virtualization of guest PAT (by NOT setting "ignore guest PAT") when a guest has non-coherent DMA, i.e. KVM doesn't need to zap SPTEs when the guest PAT changes. The read path is and always has been trivial, i.e. burying it in the MTRR code does more harm than good. WARN and continue for the PAT case in kvm_set_msr_common(), as that code is _currently_ reached if and only if KVM is buggy. Defer cleaning up the lack of symmetry between the read and write paths to a future patch. Reviewed-by: Kai Huang <kai.huang@intel.com> Link: https://lore.kernel.org/r/20230511233351.635053-7-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/mtrr.c | 19 ++++++------------- arch/x86/kvm/x86.c | 13 +++++++++++++ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index dc213b940141..cdbbb511f940 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -55,7 +55,6 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: return true; } return false; @@ -74,9 +73,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!msr_mtrr_valid(msr)) return false; - if (msr == MSR_IA32_CR_PAT) { - return kvm_pat_valid(data); - } else if (msr == MSR_MTRRdefType) { + if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; return valid_mtrr_type(data & 0xff); @@ -324,8 +321,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - if (msr == MSR_IA32_CR_PAT || !tdp_enabled || - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) @@ -392,8 +388,6 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; else if (msr == MSR_MTRRdefType) vcpu->arch.mtrr_state.deftype = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; else set_var_mtrr_msr(vcpu, msr, data); @@ -421,13 +415,12 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 1; index = fixed_msr_to_range_index(msr); - if (index >= 0) + if (index >= 0) { *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; - else if (msr == MSR_MTRRdefType) + } else if (msr == MSR_MTRRdefType) { *pdata = vcpu->arch.mtrr_state.deftype; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ + } else { + /* Variable MTRRs */ if (is_mtrr_base_msr(msr)) *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8017e0eb1e67..902c9935979f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3703,6 +3703,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_CR_PAT: + /* + * Writes to PAT should be handled by vendor code as both SVM + * and VMX track the guest's PAT in the VMCB/VMCS. + */ + WARN_ON_ONCE(1); + + if (!kvm_pat_valid(data)) + return 1; + + vcpu->arch.pat = data; + break; case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); @@ -4112,6 +4123,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_IA32_CR_PAT: + msr_info->data = vcpu->arch.pat; + break; case MSR_MTRRcap: case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: -- cgit v1.2.3 From 3a5f49078eb5106f9b918066908508c3044b5ada Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:50 -0700 Subject: KVM: x86: Make kvm_mtrr_valid() static now that there are no external users Make kvm_mtrr_valid() local to mtrr.c now that it's not used to check the validity of a PAT MSR value. Reviewed-by: Kai Huang <kai.huang@intel.com> Link: https://lore.kernel.org/r/20230511233351.635053-8-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/mtrr.c | 3 +-- arch/x86/kvm/x86.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index cdbbb511f940..3eb6e7f47e96 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -65,7 +65,7 @@ static bool valid_mtrr_type(unsigned t) return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; u64 mask; @@ -100,7 +100,6 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return (data & mask) == 0; } -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c544602d07a3..82e3dafc5453 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -309,7 +309,6 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, -- cgit v1.2.3 From dee321977a230802a5065af4ad4f4f5e8558a738 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:51 -0700 Subject: KVM: x86: Move common handling of PAT MSR writes to kvm_set_msr_common() Move the common check-and-set handling of PAT MSR writes out of vendor code and into kvm_set_msr_common(). This aligns writes with reads, which are already handled in common code, i.e. makes the handling of reads and writes symmetrical in common code. Alternatively, the common handling in kvm_get_msr_common() could be moved to vendor code, but duplicating code is generally undesirable (even though the duplicatated code is trivial in this case), and guest writes to PAT should be rare, i.e. the overhead of the extra function call is a non-issue in practice. Suggested-by: Kai Huang <kai.huang@intel.com> Reviewed-by: Kai Huang <kai.huang@intel.com> Link: https://lore.kernel.org/r/20230511233351.635053-9-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/svm/svm.c | 7 ++++--- arch/x86/kvm/vmx/vmx.c | 7 +++---- arch/x86/kvm/x86.c | 6 ------ 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e1e17c7dc7d5..924609d63f8a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2939,9 +2939,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: - if (!kvm_pat_valid(data)) - return 1; - vcpu->arch.pat = data; + ret = kvm_set_msr_common(vcpu, msr); + if (ret) + break; + svm->vmcb01.ptr->save.g_pat = data; if (is_guest_mode(vcpu)) nested_vmcb02_compute_g_pat(svm); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 33b8625d3541..2d9d155691a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2287,10 +2287,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; goto find_uret_msr; case MSR_IA32_CR_PAT: - if (!kvm_pat_valid(data)) - return 1; - - vcpu->arch.pat = data; + ret = kvm_set_msr_common(vcpu, msr_info); + if (ret) + break; if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 902c9935979f..5ad55ef71433 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3703,12 +3703,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_CR_PAT: - /* - * Writes to PAT should be handled by vendor code as both SVM - * and VMX track the guest's PAT in the VMCB/VMCS. - */ - WARN_ON_ONCE(1); - if (!kvm_pat_valid(data)) return 1; -- cgit v1.2.3 From 0d42522bdee70b9197be63dd76c9f6297cd1e832 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Wed, 19 Apr 2023 10:19:25 +0800 Subject: KVM: x86: Fix poll command According to the hardware manual, when the Poll command is issued, the byte returned by the I/O read is 1 in Bit 7 when there is an interrupt, and the highest priority binary code in Bits 2:0. The current pic simulation code is not implemented strictly according to the above expression. Fix the implementation of pic_poll_read(), set Bit 7 when there is an interrupt. Signed-off-by: Jinliang Zheng <alexjlzheng@tencent.com> Link: https://lore.kernel.org/r/20230419021924.1342184-1-alexjlzheng@tencent.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/i8259.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4756bcb5724f..8dec646e764b 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -411,7 +411,10 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); + /* Bit 7 is 1, means there's an interrupt */ + ret |= 0x80; } else { + /* Bit 7 is 0, means there's no interrupt */ ret = 0x07; pic_update_irq(s->pics_state); } -- cgit v1.2.3 From ab322c43cce97ff6d05439c9b72bf1513e3e1020 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 26 May 2023 14:03:39 -0700 Subject: KVM: x86: Update number of entries for KVM_GET_CPUID2 on success, not failure Update cpuid->nent if and only if kvm_vcpu_ioctl_get_cpuid2() succeeds. The sole caller copies @cpuid to userspace only on success, i.e. the existing code effectively does nothing. Arguably, KVM should report the number of entries when returning -E2BIG so that userspace doesn't have to guess the size, but all other similar KVM ioctls() don't report the size either, i.e. userspace is conditioned to guess. Suggested-by: Takahiro Itazuri <itazur@amazon.com> Link: https://lore.kernel.org/all/20230410141820.57328-1-itazur@amazon.com Link: https://lore.kernel.org/r/20230526210340.2799158-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/cpuid.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0c9660a07b23..241f554f1764 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -501,20 +501,15 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { - int r; - - r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; + return -E2BIG; + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; + return -EFAULT; -out: cpuid->nent = vcpu->arch.cpuid_nent; - return r; + return 0; } /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ -- cgit v1.2.3 From 2c76131319982f9c9410bc12127ac1df4e810b87 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 26 May 2023 14:03:40 -0700 Subject: KVM: selftests: Extend cpuid_test to verify KVM_GET_CPUID2 "nent" updates Verify that KVM reports the actual number of CPUID entries on success, but doesn't touch the userspace struct on failure (which for better or worse, is KVM's ABI). Link: https://lore.kernel.org/r/20230526210340.2799158-3-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 2fc3ad9c887e..d3c3aa93f090 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -163,6 +163,25 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) ent->eax = eax; } +static void test_get_cpuid2(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1); + int i, r; + + vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent, + "KVM didn't update nent on success, wanted %u, got %u\n", + vcpu->cpuid->nent, cpuid->nent); + + for (i = 0; i < vcpu->cpuid->nent; i++) { + cpuid->nent = i; + r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r)); + TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure"); + } + free(cpuid); +} + int main(void) { struct kvm_vcpu *vcpu; @@ -183,5 +202,7 @@ int main(void) set_cpuid_after_run(vcpu); + test_get_cpuid2(vcpu); + kvm_vm_free(vm); } -- cgit v1.2.3 From 06b66e050095db599f4f598ee627d1dc9c933018 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Thu, 18 May 2023 17:13:39 +0800 Subject: KVM: x86: Fix a typo in Documentation/virt/kvm/x86/mmu.rst L1 CR4.LA57 should be '0' instead of '1' when shadowing 5-level NPT for 4-level NPT L1 guest. Signed-off-by: Binbin Wu <binbin.wu@linux.intel.com> Link: https://lore.kernel.org/r/20230518091339.1102-4-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- Documentation/virt/kvm/x86/mmu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst index 8364afa228ec..26f62034b6f3 100644 --- a/Documentation/virt/kvm/x86/mmu.rst +++ b/Documentation/virt/kvm/x86/mmu.rst @@ -205,7 +205,7 @@ Shadow pages contain the following information: role.passthrough: The page is not backed by a guest page table, but its first entry points to one. This is set if NPT uses 5-level page tables (host - CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=1). + CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=0). gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. -- cgit v1.2.3 From 02f1b0b736606f9870595b3089d9c124f9da8be9 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 24 May 2023 14:16:32 +0800 Subject: KVM: x86: Correct the name for skipping VMENTER l1d flush There is no VMENTER_L1D_FLUSH_NESTED_VM. It should be ARCH_CAP_SKIP_VMENTRY_L1DFLUSH. Signed-off-by: Chao Gao <chao.gao@intel.com> Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com> Link: https://lore.kernel.org/r/20230524061634.54141-3-chao.gao@intel.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5ad55ef71433..7c7be4815eaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1631,7 +1631,7 @@ static u64 kvm_get_arch_capabilities(void) * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us - * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that * capability to the guest too, and if EPT is disabled we're not * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will * require a nested hypervisor to do a flush of its own. -- cgit v1.2.3 From 056b9919a16aedc560e6756a235ef5a62a68db05 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:05:50 -0700 Subject: KVM: x86: Use cpu_feature_enabled() for PKU instead of #ifdef Replace an #ifdef on CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS with a cpu_feature_enabled() check on X86_FEATURE_PKU. The macro magic of DISABLED_MASK_BIT_SET() means that cpu_feature_enabled() provides the same end result (no code generated) when PKU is disabled by Kconfig. No functional change intended. Cc: Jon Kohler <jon@nutanix.com> Reviewed-by: Jon Kohler <jon@nutanix.com> Link: https://lore.kernel.org/r/20230602010550.785722-1-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/x86.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7c7be4815eaa..6f461afed800 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1017,13 +1017,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -1032,15 +1030,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { -- cgit v1.2.3 From e12fa4b92a07f4274ffa02e581d059a7869dd50e Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 5 Jun 2023 22:01:21 +0200 Subject: KVM: x86: Clean up: remove redundant bool conversions As test_bit() returns bool, explicitly converting result to bool is unnecessary. Get rid of '!!'. No functional change intended. Suggested-by: Sean Christopherson <seanjc@google.com> Signed-off-by: Michal Luczaj <mhal@rbox.co> Link: https://lore.kernel.org/r/20230605200158.118109-1-mhal@rbox.co Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 924609d63f8a..02183ebc0e16 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -752,7 +752,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) BUG_ON(offset == MSR_INVALID); - return !!test_bit(bit_write, &tmp); + return test_bit(bit_write, &tmp); } static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f461afed800..355f3df8dbad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1805,7 +1805,7 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - allowed = !!test_bit(index - start, bitmap); + allowed = test_bit(index - start, bitmap); break; } } -- cgit v1.2.3 From a30642570855faa1992f333ce5cb60351b0a37da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 6 Jun 2023 17:46:36 -0700 Subject: KVM: x86: Update comments about MSR lists exposed to userspace Refresh comments about msrs_to_save, emulated_msrs, and msr_based_features to remove stale references left behind by commit 2374b7310b66 (KVM: x86/pmu: Use separate array for defining "PMU MSRs to save"), and to better reflect the current reality, e.g. emulated_msrs is no longer just for MSRs that are "kvm-specific". Reported-by: Binbin Wu <binbin.wu@linux.intel.com> Link: https://lore.kernel.org/r/20230607004636.1421424-1-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/x86.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 355f3df8dbad..07b0f960de07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1423,15 +1423,14 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) - * extract the supported MSRs from the related const lists. - * msrs_to_save is selected from the msrs_to_save_all to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs - * may depend on host virtualization features rather than host cpu features. + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. */ static const u32 msrs_to_save_base[] = { @@ -1527,11 +1526,11 @@ static const u32 emulated_msrs_all[] = { MSR_IA32_UCODE_REV, /* - * The following list leaves out MSRs whose values are determined - * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. - * We always support the "true" VMX control MSRs, even if the host - * processor does not, so I am putting these registers here rather - * than in msrs_to_save_all. + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, -- cgit v1.2.3 From fb1273635f8c38df18483ffcd038a5b792dfcc94 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Jun 2023 18:02:33 +0300 Subject: KVM: x86: Remove PRIx* definitions as they are solely for user space In the Linux kernel we do not support PRI.64 specifiers. Moreover they seem not to be used anyway here. Drop them. Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Link: https://lore.kernel.org/r/20230616150233.83813-1-andriy.shevchenko@linux.intel.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- arch/x86/kvm/lapic.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e542cf285b51..58ee570db395 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -51,11 +51,6 @@ #define mod_64(x, y) ((x) % (y)) #endif -#define PRId64 "d" -#define PRIx64 "llx" -#define PRIu64 "u" -#define PRIo64 "o" - /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) -- cgit v1.2.3 From b7dac767c9356fe94fe345f907adf573cf745d8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Apr 2023 10:16:50 -0700 Subject: Documentation/process: Add a label for the tip tree handbook's coding style Add a label for the tip tree's "Coding style notes" so that a forthcoming KVM x86 handbook can reference/piggyback the tip tree's preferred coding style. Link: https://lore.kernel.org/r/20230411171651.1067966-2-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- Documentation/process/maintainer-tip.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst index 178c95fd17dc..0cac75a86372 100644 --- a/Documentation/process/maintainer-tip.rst +++ b/Documentation/process/maintainer-tip.rst @@ -452,6 +452,8 @@ and can be added to an existing kernel config by running: Some of these options are x86-specific and can be left out when testing on other architectures. +.. _maintainer-tip-coding-style: + Coding style notes ------------------ -- cgit v1.2.3 From 63e2f55cabedf8a7ede928f7cf3ab028af44b9e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Apr 2023 10:16:51 -0700 Subject: Documentation/process: Add a maintainer handbook for KVM x86 Add a KVM x86 doc to the subsystem/maintainer handbook section to explain how KVM x86 (currently) operates as a sub-subsystem, and to soapbox on the rules and expectations for contributing to KVM x86. Reviewed-by: Like Xu <likexu@tencent.com> Link: https://lore.kernel.org/r/20230411171651.1067966-3-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com> --- Documentation/process/maintainer-handbooks.rst | 1 + Documentation/process/maintainer-kvm-x86.rst | 390 +++++++++++++++++++++++++ MAINTAINERS | 1 + 3 files changed, 392 insertions(+) create mode 100644 Documentation/process/maintainer-kvm-x86.rst diff --git a/Documentation/process/maintainer-handbooks.rst b/Documentation/process/maintainer-handbooks.rst index d783060b4cc6..d12cbbe2b7df 100644 --- a/Documentation/process/maintainer-handbooks.rst +++ b/Documentation/process/maintainer-handbooks.rst @@ -17,3 +17,4 @@ Contents: maintainer-tip maintainer-netdev + maintainer-kvm-x86 diff --git a/Documentation/process/maintainer-kvm-x86.rst b/Documentation/process/maintainer-kvm-x86.rst new file mode 100644 index 000000000000..9183bd449762 --- /dev/null +++ b/Documentation/process/maintainer-kvm-x86.rst @@ -0,0 +1,390 @@ +.. SPDX-License-Identifier: GPL-2.0 + +KVM x86 +======= + +Foreword +-------- +KVM strives to be a welcoming community; contributions from newcomers are +valued and encouraged. Please do not be discouraged or intimidated by the +length of this document and the many rules/guidelines it contains. Everyone +makes mistakes, and everyone was a newbie at some point. So long as you make +an honest effort to follow KVM x86's guidelines, are receptive to feedback, +and learn from any mistakes you make, you will be welcomed with open arms, not +torches and pitchforks. + +TL;DR +----- +Testing is mandatory. Be consistent with established styles and patterns. + +Trees +----- +KVM x86 is currently in a transition period from being part of the main KVM +tree, to being "just another KVM arch". As such, KVM x86 is split across the +main KVM tree, ``git.kernel.org/pub/scm/virt/kvm/kvm.git``, and a KVM x86 +specific tree, ``github.com/kvm-x86/linux.git``. + +Generally speaking, fixes for the current cycle are applied directly to the +main KVM tree, while all development for the next cycle is routed through the +KVM x86 tree. In the unlikely event that a fix for the current cycle is routed +through the KVM x86 tree, it will be applied to the ``fixes`` branch before +making its way to the main KVM tree. + +Note, this transition period is expected to last quite some time, i.e. will be +the status quo for the foreseeable future. + +Branches +~~~~~~~~ +The KVM x86 tree is organized into multiple topic branches. The purpose of +using finer-grained topic branches is to make it easier to keep tabs on an area +of development, and to limit the collateral damage of human errors and/or buggy +commits, e.g. dropping the HEAD commit of a topic branch has no impact on other +in-flight commits' SHA1 hashes, and having to reject a pull request due to bugs +delays only that topic branch. + +All topic branches, except for ``next`` and ``fixes``, are rolled into ``next`` +via a Cthulhu merge on an as-needed basis, i.e. when a topic branch is updated. +As a result, force pushes to ``next`` are common. + +Lifecycle +~~~~~~~~~ +Fixes that target the current release, a.k.a. mainline, are typically applied +directly to the main KVM tree, i.e. do not route through the KVM x86 tree. + +Changes that target the next release are routed through the KVM x86 tree. Pull +requests (from KVM x86 to main KVM) are sent for each KVM x86 topic branch, +typically the week before Linus' opening of the merge window, e.g. the week +following rc7 for "normal" releases. If all goes well, the topic branches are +rolled into the main KVM pull request sent during Linus' merge window. + +The KVM x86 tree doesn't have its own official merge window, but there's a soft +close around rc5 for new features, and a soft close around rc6 for fixes (for +the next release; see above for fixes that target the current release). + +Timeline +~~~~~~~~ +Submissions are typically reviewed and applied in FIFO order, with some wiggle +room for the size of a series, patches that are "cache hot", etc. Fixes, +especially for the current release and or stable trees, get to jump the queue. +Patches that will be taken through a non-KVM tree (most often through the tip +tree) and/or have other acks/reviews also jump the queue to some extent. + +Note, the vast majority of review is done between rc1 and rc6, give or take. +The period between rc6 and the next rc1 is used to catch up on other tasks, +i.e. radio silence during this period isn't unusual. + +Pings to get a status update are welcome, but keep in mind the timing of the +current release cycle and have realistic expectations. If you are pinging for +acceptance, i.e. not just for feedback or an update, please do everything you +can, within reason, to ensure that your patches are ready to be merged! Pings +on series that break the build or fail tests lead to unhappy maintainers! + +Development +----------- + +Base Tree/Branch +~~~~~~~~~~~~~~~~ +Fixes that target the current release, a.k.a. mainline, should be based on +``git://git.kernel.org/pub/scm/virt/kvm/kvm.git master``. Note, fixes do not +automatically warrant inclusion in the current release. There is no singular +rule, but typically only fixes for bugs that are urgent, critical, and/or were +introduced in the current release should target the current release. + +Everything else should be based on ``kvm-x86/next``, i.e. there is no need to +select a specific topic branch as the base. If there are conflicts and/or +dependencies across topic branches, it is the maintainer's job to sort them +out. + +The only exception to using ``kvm-x86/next`` as the base is if a patch/series +is a multi-arch series, i.e. has non-trivial modifications to common KVM code +and/or has more than superficial changes to other architectures' code. Multi- +arch patch/series should instead be based on a common, stable point in KVM's +history, e.g. the release candidate upon which ``kvm-x86 next`` is based. If +you're unsure whether a patch/series is truly multi-arch, err on the side of +caution and treat it as multi-arch, i.e. use a common base. + +Coding Style +~~~~~~~~~~~~ +When it comes to style, naming, patterns, etc., consistency is the number one +priority in KVM x86. If all else fails, match what already exists. + +With a few caveats listed below, follow the tip tree maintainers' preferred +:ref:`maintainer-tip-coding-style`, as patches/series often touch both KVM and +non-KVM x86 files, i.e. draw the attention of KVM *and* tip tree maintainers. + +Using reverse fir tree, a.k.a. reverse Christmas tree or reverse XMAS tree, for +variable declarations isn't strictly required, though it is still preferred. + +Except for a handful of special snowflakes, do not use kernel-doc comments for +functions. The vast majority of "public" KVM functions aren't truly public as +they are intended only for KVM-internal consumption (there are plans to +privatize KVM's headers and exports to enforce this). + +Comments +~~~~~~~~ +Write comments using imperative mood and avoid pronouns. Use comments to +provide a high level overview of the code, and/or to explain why the code does +what it does. Do not reiterate what the code literally does; let the code +speak for itself. If the code itself is inscrutable, comments will not help. + +SDM and APM References +~~~~~~~~~~~~~~~~~~~~~~ +Much of KVM's code base is directly tied to architectural behavior defined in +Intel's Software Development Manual (SDM) and AMD's Architecture Programmer’s +Manual (APM). Use of "Intel's SDM" and "AMD's APM", or even just "SDM" or +"APM", without additional context is a-ok. + +Do not reference specific sections, tables, figures, etc. by number, especially +not in comments. Instead, if necessary (see below), copy-paste the relevant +snippet and reference sections/tables/figures by name. The layouts of the SDM +and APM are constantly changing, and so the numbers/labels aren't stable. + +Generally speaking, do not explicitly reference or copy-paste from the SDM or +APM in comments. With few exceptions, KVM *must* honor architectural behavior, +therefore it's implied that KVM behavior is emulating SDM and/or APM behavior. +Note, referencing the SDM/APM in changelogs to justify the change and provide +context is perfectly ok and encouraged. + +Shortlog +~~~~~~~~ +The preferred prefix format is ``KVM: <topic>:``, where ``<topic>`` is one of:: + + - x86 + - x86/mmu + - x86/pmu + - x86/xen + - selftests + - SVM + - nSVM + - VMX + - nVMX + +**DO NOT use x86/kvm!** ``x86/kvm`` is used exclusively for Linux-as-a-KVM-guest +changes, i.e. for arch/x86/kernel/kvm.c. Do not use file names or complete file +paths as the subject/shortlog prefix. + +Note, these don't align with the topics branches (the topic branches care much +more about code conflicts). + +All names are case sensitive! ``KVM: x86:`` is good, ``kvm: vmx:`` is not. + +Capitalize the first word of the condensed patch description, but omit ending +punctionation. E.g.:: + + KVM: x86: Fix a null pointer dereference in function_xyz() + +not:: + + kvm: x86: fix a null pointer dereference in function_xyz. + +If a patch touches multiple topics, traverse up the conceptual tree to find the +first common parent (which is often simply ``x86``). When in doubt, +``git log path/to/file`` should provide a reasonable hint. + +New topics do occasionally pop up, but please start an on-list discussion if +you want to propose introducing a new topic, i.e. don't go rogue. + +See :ref:`the_canonical_patch_format` for more information, with one amendment: +do not treat the 70-75 character limit as an absolute, hard limit. Instead, +use 75 characters as a firm-but-not-hard limit, and use 80 characters as a hard +limit. I.e. let the shortlog run a few characters over the standard limit if +you have good reason to do so. + +Changelog +~~~~~~~~~ +Most importantly, write changelogs using imperative mood and avoid pronouns. + +See :ref:`describe_changes` for more information, with one amendment: lead with +a short blurb on the actual changes, and then follow up with the context and +background. Note! This order directly conflicts with the tip tree's preferred +approach! Please follow the tip tree's preferred style when sending patches +that primarily target arch/x86 code that is _NOT_ KVM code. + +Stating what a patch does before diving into details is preferred by KVM x86 +for several reasons. First and foremost, what code is actually being changed +is arguably the most important information, and so that info should be easy to +find. Changelogs that bury the "what's actually changing" in a one-liner after +3+ paragraphs of background make it very hard to find that information. + +For initial review, one could argue the "what's broken" is more important, but +for skimming logs and git archaeology, the gory details matter less and less. +E.g. when doing a series of "git blame", the details of each change along the +way are useless, the details only matter for the culprit. Providing the "what +changed" makes it easy to quickly determine whether or not a commit might be of +interest. + +Another benefit of stating "what's changing" first is that it's almost always +possible to state "what's changing" in a single sentence. Conversely, all but +the most simple bugs require multiple sentences or paragraphs to fully describe +the problem. If both the "what's changing" and "what's the bug" are super +short then the order doesn't matter. But if one is shorter (almost always the +"what's changing), then covering the shorter one first is advantageous because +it's less of an inconvenience for readers/reviewers that have a strict ordering +preference. E.g. having to skip one sentence to get to the context is less +painful than having to skip three paragraphs to get to "what's changing". + +Fixes +~~~~~ +If a change fixes a KVM/kernel bug, add a Fixes: tag even if the change doesn't +need to be backported to stable kernels, and even if the change fixes a bug in +an older release. + +Conversely, if a fix does need to be backported, explicitly tag the patch with +"Cc: stable@vger.kernel" (though the email itself doesn't need to Cc: stable); +KVM x86 opts out of backporting Fixes: by default. Some auto-selected patches +do get backported, but require explicit maintainer approval (search MANUALSEL). + +Function References +~~~~~~~~~~~~~~~~~~~ +When a function is mentioned in a comment, changelog, or shortlog (or anywhere +for that matter), use the format ``function_name()``. The parentheses provide +context and disambiguate the reference. + +Testing +------- +At a bare minimum, *all* patches in a series must build cleanly for KVM_INTEL=m +KVM_AMD=m, and KVM_WERROR=y. Building every possible combination of Kconfigs +isn't feasible, but the more the merrier. KVM_SMM, KVM_XEN, PROVE_LOCKING, and +X86_64 are particularly interesting knobs to turn. + +Running KVM selftests and KVM-unit-tests is also mandatory (and stating the +obvious, the tests need to pass). The only exception is for changes that have +negligible probability of affecting runtime behavior, e.g. patches that only +modify comments. When possible and relevant, testing on both Intel and AMD is +strongly preferred. Booting an actual VM is encouraged, but not mandatory. + +For changes that touch KVM's shadow paging code, running with TDP (EPT/NPT) +disabled is mandatory. For changes that affect common KVM MMU code, running +with TDP disabled is strongly encouraged. For all other changes, if the code +being modified depends on and/or interacts with a module param, testing with +the relevant settings is mandatory. + +Note, KVM selftests and KVM-unit-tests do have known failures. If you suspect +a failure is not due to your changes, verify that the *exact same* failure +occurs with and without your changes. + +Changes that touch reStructured Text documentation, i.e. .rst files, must build +htmldocs cleanly, i.e. with no new warnings or errors. + +If you can't fully test a change, e.g. due to lack of hardware, clearly state +what level of testing you were able to do, e.g. in the cover letter. + +New Features +~~~~~~~~~~~~ +With one exception, new features *must* come with test coverage. KVM specific +tests aren't strictly required, e.g. if coverage is provided by running a +sufficiently enabled guest VM, or by running a related kernel selftest in a VM, +but dedicated KVM tests are preferred in all cases. Negative testcases in +particular are mandatory for enabling of new hardware features as error and +exception flows are rarely exercised simply by running a VM. + +The only exception to this rule is if KVM is simply advertising support for a +feature via KVM_GET_SUPPORTED_CPUID, i.e. for instructions/features that KVM +can't prevent a guest from using and for which there is no true enabling. + +Note, "new features" does not just mean "new hardware features"! New features +that can't be well validated using existing KVM selftests and/or KVM-unit-tests +must come with tests. + +Posting new feature development without tests to get early feedback is more +than welcome, but such submissions should be tagged RFC, and the cover letter +should clearly state what type of feedback is requested/expected. Do not abuse +the RFC process; RFCs will typically not receive in-depth review. + +Bug Fixes +~~~~~~~~~ +Except for "obvious" found-by-inspection bugs, fixes must be accompanied by a +reproducer for the bug being fixed. In many cases the reproducer is implicit, +e.g. for build errors and test failures, but it should still be clear to +readers what is broken and how to verify the fix. Some leeway is given for +bugs that are found via non-public workloads/tests, but providing regression +tests for such bugs is strongly preferred. + +In general, regression tests are preferred for any bug that is not trivial to +hit. E.g. even if the bug was originally found by a fuzzer such as syzkaller, +a targeted regression test may be warranted if the bug requires hitting a +one-in-a-million type race condition. + +Note, KVM bugs are rarely urgent *and* non-trivial to reproduce. Ask yourself +if a bug is really truly the end of the world before posting a fix without a +reproducer. + +Posting +------- + +Links +~~~~~ +Do not explicitly reference bug reports, prior versions of a patch/series, etc. +via ``In-Reply-To:`` headers. Using ``In-Reply-To:`` becomes an unholy mess +for large series and/or when the version count gets high, and ``In-Reply-To:`` +is useless for anyone that doesn't have the original message, e.g. if someone +wasn't Cc'd on the bug report or if the list of recipients changes between +versions. + +To link to a bug report, previous version, or anything of interest, use lore +links. For referencing previous version(s), generally speaking do not include +a Link: in the changelog as there is no need to record the history in git, i.e. +put the link in the cover letter or in the section git ignores. Do provide a +formal Link: for bug reports and/or discussions that led to the patch. The +context of why a change was made is highly valuable for future readers. + +Git Base +~~~~~~~~ +If you are using git version 2.9.0 or later (Googlers, this is all of you!), +use ``git format-patch`` with the ``--base`` flag to automatically include the +base tree information in the generated patches. + +Note, ``--base=auto`` works as expected if and only if a branch's upstream is +set to the base topic branch, e.g. it will do the wrong thing if your upstream +is set to your personal repository for backup purposes. An alternative "auto" +solution is to derive the names of your development branches based on their +KVM x86 topic, and feed that into ``--base``. E.g. ``x86/pmu/my_branch_name``, +and then write a small wrapper to extract ``pmu`` from the current branch name +to yield ``--base=x/pmu``, where ``x`` is whatever name your repository uses to +track the KVM x86 remote. + +Co-Posting Tests +~~~~~~~~~~~~~~~~ +KVM selftests that are associated with KVM changes, e.g. regression tests for +bug fixes, should be posted along with the KVM changes as a single series. The +standard kernel rules for bisection apply, i.e. KVM changes that result in test +failures should be ordered after the selftests updates, and vice versa, new +tests that fail due to KVM bugs should be ordered after the KVM fixes. + +KVM-unit-tests should *always* be posted separately. Tools, e.g. b4 am, don't +know that KVM-unit-tests is a separate repository and get confused when patches +in a series apply on different trees. To tie KVM-unit-tests patches back to +KVM patches, first post the KVM changes and then provide a lore Link: to the +KVM patch/series in the KVM-unit-tests patch(es). + +Notifications +------------- +When a patch/series is officially accepted, a notification email will be sent +in reply to the original posting (cover letter for multi-patch series). The +notification will include the tree and topic branch, along with the SHA1s of +the commits of applied patches. + +If a subset of patches is applied, this will be clearly stated in the +notification. Unless stated otherwise, it's implied that any patches in the +series that were not accepted need more work and should be submitted in a new +version. + +If for some reason a patch is dropped after officially being accepted, a reply +will be sent to the notification email explaining why the patch was dropped, as +well as the next steps. + +SHA1 Stability +~~~~~~~~~~~~~~ +SHA1s are not 100% guaranteed to be stable until they land in Linus' tree! A +SHA1 is *usually* stable once a notification has been sent, but things happen. +In most cases, an update to the notification email be provided if an applied +patch's SHA1 changes. However, in some scenarios, e.g. if all KVM x86 branches +need to be rebased, individual notifications will not be given. + +Vulnerabilities +--------------- +Bugs that can be exploited by the guest to attack the host (kernel or +userspace), or that can be exploited by a nested VM to *its* host (L2 attacking +L1), are of particular interest to KVM. Please follow the protocol for +:ref:`securitybugs` if you suspect a bug can lead to an escape, data leak, etc. + diff --git a/MAINTAINERS b/MAINTAINERS index e0ad886d3163..d682bfa2b291 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11420,6 +11420,7 @@ M: Sean Christopherson <seanjc@google.com> M: Paolo Bonzini <pbonzini@redhat.com> L: kvm@vger.kernel.org S: Supported +P: Documentation/process/maintainer-kvm-x86.rst T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git F: arch/x86/include/asm/kvm* F: arch/x86/include/asm/svm.h -- cgit v1.2.3