aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorBorislav Petkov2020-12-01 18:01:27 +0100
committerBorislav Petkov2020-12-01 18:01:27 +0100
commit87314fb181f9042a226d721ab4a5579ddfca139c (patch)
treeb663e58cfe19ecf5003d4ea6e8048f6ae1925e9f /arch/x86
parent2002d2951398317d0f46e64ae6d8dd58ed541c6d (diff)
parentb65054597872ce3aefbc6a666385eabdf9e288da (diff)
Merge tag 'v5.10-rc6' into x86/cache
Merge -rc6 tag to pick up dependent changes. Signed-off-by: Borislav Petkov <bp@suse.de>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c1
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S20
-rw-r--r--arch/x86/boot/compressed/misc.h2
-rw-r--r--arch/x86/crypto/poly1305_glue.c1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl10
-rw-r--r--arch/x86/events/intel/core.c12
-rw-r--r--arch/x86/events/intel/cstate.c6
-rw-r--r--arch/x86/events/intel/ds.c53
-rw-r--r--arch/x86/events/intel/uncore.c4
-rw-r--r--arch/x86/events/intel/uncore.h12
-rw-r--r--arch/x86/events/intel/uncore_snb.c2
-rw-r--r--arch/x86/events/perf_event.h3
-rw-r--r--arch/x86/events/rapl.c14
-rw-r--r--arch/x86/hyperv/hv_apic.c14
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/include/asm/mwait.h2
-rw-r--r--arch/x86/include/asm/perf_event.h4
-rw-r--r--arch/x86/include/asm/sparsemem.h10
-rw-r--r--arch/x86/include/asm/uv/uv.h10
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h1
-rw-r--r--arch/x86/kernel/alternative.c9
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c29
-rw-r--r--arch/x86/kernel/cpu/bugs.c55
-rw-r--r--arch/x86/kernel/cpu/mce/core.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c63
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c65
-rw-r--r--arch/x86/kernel/dumpstack.c23
-rw-r--r--arch/x86/kernel/head_64.S16
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c3
-rw-r--r--arch/x86/kernel/perf_regs.c15
-rw-r--r--arch/x86/kernel/process.c12
-rw-r--r--arch/x86/kernel/sev-es-shared.c26
-rw-r--r--arch/x86/kernel/sev-es.c20
-rw-r--r--arch/x86/kernel/sev_verify_cbit.S89
-rw-r--r--arch/x86/kernel/tboot.c8
-rw-r--r--arch/x86/kernel/traps.c43
-rw-r--r--arch/x86/kernel/unwind_orc.c9
-rw-r--r--arch/x86/kvm/cpuid.c29
-rw-r--r--arch/x86/kvm/cpuid.h1
-rw-r--r--arch/x86/kvm/emulate.c8
-rw-r--r--arch/x86/kvm/irq.c85
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu/mmu.c24
-rw-r--r--arch/x86/kvm/mmu/spte.c16
-rw-r--r--arch/x86/kvm/mmu/spte.h16
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c7
-rw-r--r--arch/x86/kvm/svm/sev.c2
-rw-r--r--arch/x86/kvm/svm/svm.c12
-rw-r--r--arch/x86/kvm/vmx/evmcs.c3
-rw-r--r--arch/x86/kvm/vmx/evmcs.h3
-rw-r--r--arch/x86/kvm/vmx/vmx.c6
-rw-r--r--arch/x86/kvm/x86.c100
-rw-r--r--arch/x86/kvm/x86.h8
-rw-r--r--arch/x86/lib/memcpy_64.S4
-rw-r--r--arch/x86/lib/memmove_64.S4
-rw-r--r--arch/x86/lib/memset_64.S4
-rw-r--r--arch/x86/mm/mem_encrypt.c1
-rw-r--r--arch/x86/mm/numa.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c24
-rw-r--r--arch/x86/um/stub_segv.c2
-rw-r--r--arch/x86/xen/spinlock.c12
61 files changed, 652 insertions, 397 deletions
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index a5e5db6ada3c..39b2eded7bc2 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -164,6 +164,7 @@ void initialize_identity_maps(void *rmode)
add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
/* Load the new page-table. */
+ sev_verify_cbit(top_level_pgt);
write_cr3(top_level_pgt);
}
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index dd07e7b41b11..aa561795efd1 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -68,6 +68,9 @@ SYM_FUNC_START(get_sev_encryption_bit)
SYM_FUNC_END(get_sev_encryption_bit)
.code64
+
+#include "../../kernel/sev_verify_cbit.S"
+
SYM_FUNC_START(set_sev_encryption_mask)
#ifdef CONFIG_AMD_MEM_ENCRYPT
push %rbp
@@ -81,6 +84,19 @@ SYM_FUNC_START(set_sev_encryption_mask)
bts %rax, sme_me_mask(%rip) /* Create the encryption mask */
+ /*
+ * Read MSR_AMD64_SEV again and store it to sev_status. Can't do this in
+ * get_sev_encryption_bit() because this function is 32-bit code and
+ * shared between 64-bit and 32-bit boot path.
+ */
+ movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
+ rdmsr
+
+ /* Store MSR value in sev_status */
+ shlq $32, %rdx
+ orq %rdx, %rax
+ movq %rax, sev_status(%rip)
+
.Lno_sev_mask:
movq %rbp, %rsp /* Restore original stack pointer */
@@ -96,5 +112,7 @@ SYM_FUNC_END(set_sev_encryption_mask)
#ifdef CONFIG_AMD_MEM_ENCRYPT
.balign 8
-SYM_DATA(sme_me_mask, .quad 0)
+SYM_DATA(sme_me_mask, .quad 0)
+SYM_DATA(sev_status, .quad 0)
+SYM_DATA(sev_check_data, .quad 0)
#endif
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 6d31f1b4c4d1..d9a631c5973c 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -159,4 +159,6 @@ void boot_page_fault(void);
void boot_stage1_vc(void);
void boot_stage2_vc(void);
+unsigned long sev_verify_cbit(unsigned long cr3);
+
#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index e508dbd91813..c44aba290fbb 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -158,6 +158,7 @@ static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
dctx->s[1] = get_unaligned_le32(&inp[4]);
dctx->s[2] = get_unaligned_le32(&inp[8]);
dctx->s[3] = get_unaligned_le32(&inp[12]);
+ acc += POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
}
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1f47e24fb65c..379819244b91 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -364,10 +364,10 @@
440 common process_madvise sys_process_madvise
#
-# x32-specific system call numbers start at 512 to avoid cache impact
-# for native 64-bit operation. The __x32_compat_sys stubs are created
-# on-the-fly for compat_sys_*() compatibility system calls if X86_X32
-# is defined.
+# Due to a historical design error, certain syscalls are numbered differently
+# in x32 as compared to native x86_64. These syscalls have numbers 512-547.
+# Do not add new syscalls to this range. Numbers 548 and above are available
+# for non-x32 use.
#
512 x32 rt_sigaction compat_sys_rt_sigaction
513 x32 rt_sigreturn compat_sys_x32_rt_sigreturn
@@ -405,3 +405,5 @@
545 x32 execveat compat_sys_execveat
546 x32 preadv2 compat_sys_preadv64v2
547 x32 pwritev2 compat_sys_pwritev64v2
+# This is the end of the legacy x32 range. Numbers 548 and above are
+# not special and are not to be used for x32-specific syscalls.
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f1926e9f2143..af457f8cb29d 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2630,7 +2630,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
u64 pebs_enabled = cpuc->pebs_enabled;
handled++;
- x86_pmu.drain_pebs(regs);
+ x86_pmu.drain_pebs(regs, &data);
status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
/*
@@ -4987,6 +4987,12 @@ __init int intel_pmu_init(void)
x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+ if (version >= 5) {
+ x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated;
+ if (x86_pmu.intel_cap.anythread_deprecated)
+ pr_cont(" AnyThread deprecated, ");
+ }
+
/*
* Install the hw-cache-events table:
*/
@@ -5512,6 +5518,10 @@ __init int intel_pmu_init(void)
x86_pmu.intel_ctrl |=
((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+ /* AnyThread may be deprecated on arch perfmon v5 or later */
+ if (x86_pmu.intel_cap.anythread_deprecated)
+ x86_pmu.format_attrs = intel_arch_formats_attr;
+
if (x86_pmu.event_constraints) {
/*
* event on fixed counter2 (REF_CYCLES) only works on this
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 442e1ed4acd4..4eb7ee5fed72 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -107,14 +107,14 @@
MODULE_LICENSE("GPL");
#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __cstate_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __cstate_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
static ssize_t cstate_get_attr_cpumask(struct device *dev,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 404315df1e16..b47cc4226934 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -642,8 +642,8 @@ int intel_pmu_drain_bts_buffer(void)
rcu_read_lock();
perf_prepare_sample(&header, &data, event, &regs);
- if (perf_output_begin(&handle, event, header.size *
- (top - base - skip)))
+ if (perf_output_begin(&handle, &data, event,
+ header.size * (top - base - skip)))
goto unlock;
for (at = base; at < top; at++) {
@@ -670,7 +670,9 @@ unlock:
static inline void intel_pmu_drain_pebs_buffer(void)
{
- x86_pmu.drain_pebs(NULL);
+ struct perf_sample_data data;
+
+ x86_pmu.drain_pebs(NULL, &data);
}
/*
@@ -1719,23 +1721,24 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
return 0;
}
-static void __intel_pmu_pebs_event(struct perf_event *event,
- struct pt_regs *iregs,
- void *base, void *top,
- int bit, int count,
- void (*setup_sample)(struct perf_event *,
- struct pt_regs *,
- void *,
- struct perf_sample_data *,
- struct pt_regs *))
+static __always_inline void
+__intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ struct perf_sample_data *data,
+ void *base, void *top,
+ int bit, int count,
+ void (*setup_sample)(struct perf_event *,
+ struct pt_regs *,
+ void *,
+ struct perf_sample_data *,
+ struct pt_regs *))
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct perf_sample_data data;
struct x86_perf_regs perf_regs;
struct pt_regs *regs = &perf_regs.regs;
void *at = get_next_pebs_record_by_bit(base, top, bit);
- struct pt_regs dummy_iregs;
+ static struct pt_regs dummy_iregs;
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
/*
@@ -1752,14 +1755,14 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
iregs = &dummy_iregs;
while (count > 1) {
- setup_sample(event, iregs, at, &data, regs);
- perf_event_output(event, &data, regs);
+ setup_sample(event, iregs, at, data, regs);
+ perf_event_output(event, data, regs);
at += cpuc->pebs_record_size;
at = get_next_pebs_record_by_bit(at, top, bit);
count--;
}
- setup_sample(event, iregs, at, &data, regs);
+ setup_sample(event, iregs, at, data, regs);
if (iregs == &dummy_iregs) {
/*
* The PEBS records may be drained in the non-overflow context,
@@ -1767,18 +1770,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
* last record the same as other PEBS records, and doesn't
* invoke the generic overflow handler.
*/
- perf_event_output(event, &data, regs);
+ perf_event_output(event, data, regs);
} else {
/*
* All but the last records are processed.
* The last one is left to be able to call the overflow handler.
*/
- if (perf_event_overflow(event, &data, regs))
+ if (perf_event_overflow(event, data, regs))
x86_pmu_stop(event, 0);
}
}
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
@@ -1812,7 +1815,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
return;
}
- __intel_pmu_pebs_event(event, iregs, at, top, 0, n,
+ __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
setup_pebs_fixed_sample_data);
}
@@ -1835,7 +1838,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int
}
}
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
@@ -1942,14 +1945,14 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
}
if (counts[bit]) {
- __intel_pmu_pebs_event(event, iregs, base,
+ __intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_fixed_sample_data);
}
}
}
-static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
+static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1997,7 +2000,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs)
if (WARN_ON_ONCE(!event->attr.precise_ip))
continue;
- __intel_pmu_pebs_event(event, iregs, base,
+ __intel_pmu_pebs_event(event, iregs, data, base,
top, bit, counts[bit],
setup_pebs_adaptive_sample_data);
}
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 86d012b3e0b4..80d52cbe2fde 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -94,8 +94,8 @@ end:
return map;
}
-ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct uncore_event_desc *event =
container_of(attr, struct uncore_event_desc, attr);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 83d2a7d490e0..9efea154349d 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -157,7 +157,7 @@ struct intel_uncore_box {
#define UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS 2
struct uncore_event_desc {
- struct kobj_attribute attr;
+ struct device_attribute attr;
const char *config;
};
@@ -179,8 +179,8 @@ struct pci2phy_map {
struct pci2phy_map *__find_pci2phy_map(int segment);
int uncore_pcibus_to_physid(struct pci_bus *bus);
-ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf);
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
{
@@ -201,14 +201,14 @@ extern int __uncore_max_dies;
}
#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
static inline bool uncore_pmc_fixed(int idx)
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 39e632ed6ca9..bbd1120ae161 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -475,7 +475,7 @@ enum perf_snb_uncore_imc_freerunning_types {
static struct freerunning_counters snb_uncore_imc_freerunning[] = {
[SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
0x0, 0x0, 1, 32 },
- [SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE,
+ [SNB_PCI_UNCORE_IMC_DATA_WRITES] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE,
0x0, 0x0, 1, 32 },
[SNB_PCI_UNCORE_IMC_GT_REQUESTS] = { SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE,
0x0, 0x0, 1, 32 },
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ee2b9b9fc2a5..6a8edfe59b09 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -585,6 +585,7 @@ union perf_capabilities {
u64 pebs_baseline:1;
u64 perf_metrics:1;
u64 pebs_output_pt_available:1;
+ u64 anythread_deprecated:1;
};
u64 capabilities;
};
@@ -727,7 +728,7 @@ struct x86_pmu {
int pebs_record_size;
int pebs_buffer_size;
int max_pebs_events;
- void (*drain_pebs)(struct pt_regs *regs);
+ void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
unsigned long large_pebs_flags;
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 7c0120e2e957..7dbbeaacd995 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -93,18 +93,6 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
* any other bit is reserved
*/
#define RAPL_EVENT_MASK 0xFFULL
-
-#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
- char *page) \
-{ \
- BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
- return sprintf(page, _format "\n"); \
-} \
-static struct kobj_attribute format_attr_##_var = \
- __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
-
#define RAPL_CNTR_WIDTH 32
#define RAPL_EVENT_ATTR_STR(_name, v, str) \
@@ -441,7 +429,7 @@ static struct attribute_group rapl_pmu_events_group = {
.attrs = attrs_empty,
};
-DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
&format_attr_event.attr,
NULL,
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 40e0e322161d..284e73661a18 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -273,11 +273,15 @@ void __init hv_apic_init(void)
pr_info("Hyper-V: Using enlightened APIC (%s mode)",
x2apic_enabled() ? "x2apic" : "xapic");
/*
- * With x2apic, architectural x2apic MSRs are equivalent to the
- * respective synthetic MSRs, so there's no need to override
- * the apic accessors. The only exception is
- * hv_apic_eoi_write, because it benefits from lazy EOI when
- * available, but it works for both xapic and x2apic modes.
+ * When in x2apic mode, don't use the Hyper-V specific APIC
+ * accessors since the field layout in the ICR register is
+ * different in x2apic mode. Furthermore, the architectural
+ * x2apic MSRs function just as well as the Hyper-V
+ * synthetic APIC MSRs, so there's no benefit in having
+ * separate Hyper-V accessors for x2apic mode. The only
+ * exception is hv_apic_eoi_write, because it benefits from
+ * lazy EOI when available, but the same accessor works for
+ * both xapic and x2apic because the field layout is the same.
*/
apic_set_eoi_write(hv_apic_eoi_write);
if (!x2apic_enabled()) {
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d44858b69353..7e5f33a0d0e2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -639,6 +639,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
+ unsigned long cr3_lm_rsvd_bits;
int maxphyaddr;
int max_tdp_level;
@@ -1655,6 +1656,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index e039a933aca3..29dd27b5a339 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -88,8 +88,6 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
- trace_hardirqs_on();
-
mds_idle_clear_cpu_buffers();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 6960cd6d1f23..b9a7fd0a27e2 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -137,7 +137,9 @@ union cpuid10_edx {
struct {
unsigned int num_counters_fixed:5;
unsigned int bit_width_fixed:8;
- unsigned int reserved:19;
+ unsigned int reserved1:2;
+ unsigned int anythread_deprecated:1;
+ unsigned int reserved2:16;
} split;
unsigned int full;
};
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 6bfc878f6771..6a9ccc1b2be5 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -28,4 +28,14 @@
#endif
#endif /* CONFIG_SPARSEMEM */
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+extern int phys_to_target_node(phys_addr_t start);
+#define phys_to_target_node phys_to_target_node
+extern int memory_add_physaddr_to_nid(u64 start);
+#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+#endif
+#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_SPARSEMEM_H */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 172d3e4a9e4b..648eb23fe7f0 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -2,14 +2,8 @@
#ifndef _ASM_X86_UV_UV_H
#define _ASM_X86_UV_UV_H
-#include <asm/tlbflush.h>
-
enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC};
-struct cpumask;
-struct mm_struct;
-struct flush_tlb_info;
-
#ifdef CONFIG_X86_UV
#include <linux/efi.h>
@@ -44,10 +38,6 @@ static inline int is_uv_system(void) { return 0; }
static inline int is_uv_hubbed(int uv) { return 0; }
static inline void uv_cpu_init(void) { }
static inline void uv_system_init(void) { }
-static inline const struct cpumask *
-uv_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
-{ return cpumask; }
#endif /* X86_UV */
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 812e9b4c1114..950afebfba88 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -32,6 +32,7 @@
#define KVM_FEATURE_POLL_CONTROL 12
#define KVM_FEATURE_PV_SCHED_YIELD 13
#define KVM_FEATURE_ASYNC_PF_INT 14
+#define KVM_FEATURE_MSI_EXT_DEST_ID 15
#define KVM_HINTS_REALTIME 0
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4adbe65afe23..2400ad62f330 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -807,6 +807,15 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
+
+ /*
+ * Make sure not to be in TLB lazy mode, as otherwise we'll end up
+ * with a stale address space WITHOUT being in lazy mode after
+ * restoring the previous mm.
+ */
+ if (this_cpu_read(cpu_tlbstate.is_lazy))
+ leave_mm(smp_processor_id());
+
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 714233cee0b5..1b98f8c12b96 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -33,7 +33,7 @@ static union uvh_apicid uvh_apicid;
static int uv_node_id;
/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */
-static u8 uv_archtype[UV_AT_SIZE];
+static u8 uv_archtype[UV_AT_SIZE + 1];
static u8 oem_id[ACPI_OEM_ID_SIZE + 1];
static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
@@ -290,6 +290,9 @@ static void __init uv_stringify(int len, char *to, char *from)
{
/* Relies on 'to' being NULL chars so result will be NULL terminated */
strncpy(to, from, len-1);
+
+ /* Trim trailing spaces */
+ (void)strim(to);
}
/* Find UV arch type entry in UVsystab */
@@ -317,7 +320,7 @@ static int __init decode_arch_type(unsigned long ptr)
if (n > 0 && n < sizeof(uv_ate->archtype)) {
pr_info("UV: UVarchtype received from BIOS\n");
- uv_stringify(UV_AT_SIZE, uv_archtype, uv_ate->archtype);
+ uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype);
return 1;
}
return 0;
@@ -366,7 +369,7 @@ static int __init early_get_arch_type(void)
return ret;
}
-static int __init uv_set_system_type(char *_oem_id)
+static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
{
/* Save OEM_ID passed from ACPI MADT */
uv_stringify(sizeof(oem_id), oem_id, _oem_id);
@@ -375,7 +378,7 @@ static int __init uv_set_system_type(char *_oem_id)
if (!early_get_arch_type())
/* If not use OEM ID for UVarchtype */
- uv_stringify(UV_AT_SIZE, uv_archtype, _oem_id);
+ uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id);
/* Check if not hubbed */
if (strncmp(uv_archtype, "SGI", 3) != 0) {
@@ -386,13 +389,23 @@ static int __init uv_set_system_type(char *_oem_id)
/* (Not hubless), not a UV */
return 0;
+ /* Is UV hubless system */
+ uv_hubless_system = 0x01;
+
+ /* UV5 Hubless */
+ if (strncmp(uv_archtype, "NSGI5", 5) == 0)
+ uv_hubless_system |= 0x20;
+
/* UV4 Hubless: CH */
- if (strncmp(uv_archtype, "NSGI4", 5) == 0)
- uv_hubless_system = 0x11;
+ else if (strncmp(uv_archtype, "NSGI4", 5) == 0)
+ uv_hubless_system |= 0x10;
/* UV3 Hubless: UV300/MC990X w/o hub */
else
- uv_hubless_system = 0x9;
+ uv_hubless_system |= 0x8;
+
+ /* Copy APIC type */
+ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
oem_id, oem_table_id, uv_system_type, uv_hubless_system);
@@ -456,7 +469,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
/* If not UV, return. */
- if (likely(uv_set_system_type(_oem_id) == 0))
+ if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
return 0;
/* Save and Decode OEM Table ID */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d3f0db463f96..d41b70fe4918 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -739,11 +739,13 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
if (boot_cpu_has(X86_FEATURE_IBPB)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ spectre_v2_user_ibpb = mode;
switch (cmd) {
case SPECTRE_V2_USER_CMD_FORCE:
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
static_branch_enable(&switch_mm_always_ibpb);
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_PRCTL:
case SPECTRE_V2_USER_CMD_AUTO:
@@ -757,8 +759,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
static_key_enabled(&switch_mm_always_ibpb) ?
"always-on" : "conditional");
-
- spectre_v2_user_ibpb = mode;
}
/*
@@ -1254,6 +1254,14 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
return 0;
}
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
{
switch (ctrl) {
@@ -1261,16 +1269,26 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return 0;
+
/*
- * Indirect branch speculation is always disabled in strict
- * mode. It can neither be enabled if it was force-disabled
- * by a previous prctl call.
+ * With strict mode for both IBPB and STIBP, the instruction
+ * code paths avoid checking this task flag and instead,
+ * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
*/
- if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ if (!is_spec_ib_user_controlled() ||
task_spec_ib_force_disable(task))
return -EPERM;
+
task_clear_spec_ib_disable(task);
task_update_spec_tif(task);
break;
@@ -1283,10 +1301,10 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return -EPERM;
- if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+
+ if (!is_spec_ib_user_controlled())
return 0;
+
task_set_spec_ib_disable(task);
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
@@ -1351,20 +1369,17 @@ static int ib_prctl_get(struct task_struct *task)
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
return PR_SPEC_ENABLE;
- else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
- return PR_SPEC_DISABLE;
- else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
- spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) {
+ else if (is_spec_ib_user_controlled()) {
if (task_spec_ib_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
if (task_spec_ib_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- } else
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else
return PR_SPEC_NOT_AFFECTED;
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4102b866e7c0..32b7099e3511 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1384,8 +1384,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
* When there's any problem use only local no_way_out state.
*/
if (!lmce) {
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
+ if (mce_end(order) < 0) {
+ if (!no_way_out)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ }
} else {
/*
* If there was a fatal machine check we should have
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 6a99535d7f37..7e8e07bddd5f 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -100,53 +100,6 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
return find_matching_signature(mc, csig, cpf);
}
-/*
- * Given CPU signature and a microcode patch, this function finds if the
- * microcode patch has matching family and model with the CPU.
- *
- * %true - if there's a match
- * %false - otherwise
- */
-static bool microcode_matches(struct microcode_header_intel *mc_header,
- unsigned long sig)
-{
- unsigned long total_size = get_totalsize(mc_header);
- unsigned long data_size = get_datasize(mc_header);
- struct extended_sigtable *ext_header;
- unsigned int fam_ucode, model_ucode;
- struct extended_signature *ext_sig;
- unsigned int fam, model;
- int ext_sigcount, i;
-
- fam = x86_family(sig);
- model = x86_model(sig);
-
- fam_ucode = x86_family(mc_header->sig);
- model_ucode = x86_model(mc_header->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- return false;
-
- ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
- ext_sigcount = ext_header->count;
-
- for (i = 0; i < ext_sigcount; i++) {
- fam_ucode = x86_family(ext_sig->sig);
- model_ucode = x86_model(ext_sig->sig);
-
- if (fam == fam_ucode && model == model_ucode)
- return true;
-
- ext_sig++;
- }
- return false;
-}
-
static struct ucode_patch *memdup_patch(void *data, unsigned int size)
{
struct ucode_patch *p;
@@ -164,7 +117,7 @@ static struct ucode_patch *memdup_patch(void *data, unsigned int size)
return p;
}
-static void save_microcode_patch(void *data, unsigned int size)
+static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
{
struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
struct ucode_patch *iter, *tmp, *p = NULL;
@@ -210,6 +163,9 @@ static void save_microcode_patch(void *data, unsigned int size)
if (!p)
return;
+ if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
+ return;
+
/*
* Save for early loading. On 32-bit, that needs to be a physical
* address as the APs are running from physical addresses, before
@@ -344,13 +300,14 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
size -= mc_size;
- if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ if (!find_matching_signature(data, uci->cpu_sig.sig,
+ uci->cpu_sig.pf)) {
data += mc_size;
continue;
}
if (save) {
- save_microcode_patch(data, mc_size);
+ save_microcode_patch(uci, data, mc_size);
goto next;
}
@@ -483,14 +440,14 @@ static void show_saved_mc(void)
* Save this microcode patch. It will be loaded early when a CPU is
* hot-added or resumes.
*/
-static void save_mc_for_early(u8 *mc, unsigned int size)
+static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)
{
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
mutex_lock(&x86_cpu_microcode_mutex);
- save_microcode_patch(mc, size);
+ save_microcode_patch(uci, mc, size);
show_saved_mc();
mutex_unlock(&x86_cpu_microcode_mutex);
@@ -935,7 +892,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
- save_mc_for_early(new_mc, new_mc_size);
+ save_mc_for_early(uci, new_mc, new_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 78dcbb8e0172..05a026d4420b 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -507,6 +507,24 @@ unlock:
return ret ?: nbytes;
}
+/**
+ * rdtgroup_remove - the helper to remove resource group safely
+ * @rdtgrp: resource group to remove
+ *
+ * On resource group creation via a mkdir, an extra kernfs_node reference is
+ * taken to ensure that the rdtgroup structure remains accessible for the
+ * rdtgroup_kn_unlock() calls where it is removed.
+ *
+ * Drop the extra reference here, then free the rdtgroup structure.
+ *
+ * Return: void
+ */
+static void rdtgroup_remove(struct rdtgroup *rdtgrp)
+{
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+}
+
struct task_move_callback {
struct callback_head work;
struct rdtgroup *rdtgrp;
@@ -529,7 +547,7 @@ static void move_myself(struct callback_head *head)
(rdtgrp->flags & RDT_DELETED)) {
current->closid = 0;
current->rmid = 0;
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
}
if (unlikely(current->flags & PF_EXITING))
@@ -1769,7 +1787,6 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
if (IS_ERR(kn_subdir))
return PTR_ERR(kn_subdir);
- kernfs_get(kn_subdir);
ret = rdtgroup_kn_set_ugid(kn_subdir);
if (ret)
return ret;
@@ -1792,7 +1809,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
if (IS_ERR(kn_info))
return PTR_ERR(kn_info);
- kernfs_get(kn_info);
ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
if (ret)
@@ -1813,12 +1829,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
goto out_destroy;
}
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn_info);
-
ret = rdtgroup_kn_set_ugid(kn_info);
if (ret)
goto out_destroy;
@@ -1847,12 +1857,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
if (dest_kn)
*dest_kn = kn;
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that @rdtgrp->kn is always accessible.
- */
- kernfs_get(kn);
-
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -2079,8 +2083,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
rdtgroup_pseudo_lock_remove(rdtgrp);
kernfs_unbreak_active_protection(kn);
- kernfs_put(rdtgrp->kn);
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
} else {
kernfs_unbreak_active_protection(kn);
}
@@ -2139,13 +2142,11 @@ static int rdt_get_tree(struct fs_context *fc)
&kn_mongrp);
if (ret < 0)
goto out_info;
- kernfs_get(kn_mongrp);
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
if (ret < 0)
goto out_mongrp;
- kernfs_get(kn_mondata);
rdtgroup_default.mon.mon_data_kn = kn_mondata;
}
@@ -2357,7 +2358,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
if (atomic_read(&sentry->waitcount) != 0)
sentry->flags = RDT_DELETED;
else
- kfree(sentry);
+ rdtgroup_remove(sentry);
}
}
@@ -2399,7 +2400,7 @@ static void rmdir_all_sub(void)
if (atomic_read(&rdtgrp->waitcount) != 0)
rdtgrp->flags = RDT_DELETED;
else
- kfree(rdtgrp);
+ rdtgroup_remove(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
@@ -2499,11 +2500,6 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
if (IS_ERR(kn))
return PTR_ERR(kn);
- /*
- * This extra ref will be put in kernfs_remove() and guarantees
- * that kn is always accessible.
- */
- kernfs_get(kn);
ret = rdtgroup_kn_set_ugid(kn);
if (ret)
goto out_destroy;
@@ -2838,8 +2834,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
/*
* kernfs_remove() will drop the reference count on "kn" which
* will free it. But we still need it to stick around for the
- * rdtgroup_kn_unlock(kn} call below. Take one extra reference
- * here, which will be dropped inside rdtgroup_kn_unlock().
+ * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
+ * which will be dropped by kernfs_put() in rdtgroup_remove().
*/
kernfs_get(kn);
@@ -2880,6 +2876,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
out_idfree:
free_rmid(rdtgrp->mon.rmid);
out_destroy:
+ kernfs_put(rdtgrp->kn);
kernfs_remove(rdtgrp->kn);
out_free_rgrp:
kfree(rdtgrp);
@@ -2892,7 +2889,7 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
{
kernfs_remove(rgrp->kn);
free_rmid(rgrp->mon.rmid);
- kfree(rgrp);
+ rdtgroup_remove(rgrp);
}
/*
@@ -3049,11 +3046,6 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
list_del(&rdtgrp->mon.crdtgrp_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
return 0;
@@ -3065,11 +3057,6 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
rdtgrp->flags = RDT_DELETED;
list_del(&rdtgrp->rdtgroup_list);
- /*
- * one extra hold on this, will drop when we kfree(rdtgrp)
- * in rdtgroup_kn_unlock()
- */
- kernfs_get(kn);
kernfs_remove(rdtgrp->kn);
return 0;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 25c06b67e7e0..97aa900386cb 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -78,6 +78,9 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
if (!user_mode(regs))
return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);
+ /* The user space code from other tasks cannot be accessed. */
+ if (regs != task_pt_regs(current))
+ return -EPERM;
/*
* Make sure userspace isn't trying to trick us into dumping kernel
* memory by pointing the userspace instruction pointer at it.
@@ -85,6 +88,12 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))
return -EINVAL;
+ /*
+ * Even if named copy_from_user_nmi() this can be invoked from
+ * other contexts and will not try to resolve a pagefault, which is
+ * the correct thing to do here as this code can be called from any
+ * context.
+ */
return copy_from_user_nmi(buf, (void __user *)src, nbytes);
}
@@ -115,13 +124,19 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)
u8 opcodes[OPCODE_BUFSIZE];
unsigned long prologue = regs->ip - PROLOGUE_SIZE;
- if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
- printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
- loglvl, prologue);
- } else {
+ switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
+ case 0:
printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
__stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
+ break;
+ case -EPERM:
+ /* No access to the user space stack of other tasks. Ignore. */
+ break;
+ default:
+ printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
+ loglvl, prologue);
+ break;
}
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 7eb2a1c87969..3c417734790f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -161,6 +161,21 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup early boot stage 4-/5-level pagetables. */
addq phys_base(%rip), %rax
+
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ * %rsi carries pointer to realmode data and is callee-clobbered. Save
+ * and restore it.
+ */
+ pushq %rsi
+ movq %rax, %rdi
+ call sev_verify_cbit
+ popq %rsi
+
+ /* Switch to new page-table */
movq %rax, %cr3
/* Ensure I am executing from virtual addresses */
@@ -279,6 +294,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
+#include "sev_verify_cbit.S"
#ifdef CONFIG_HOTPLUG_CPU
/*
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 57c2ecf43134..ce831f9448e7 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -200,8 +200,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
/* Copying screen_info will do? */
- memcpy(&params->screen_info, &boot_params.screen_info,
- sizeof(struct screen_info));
+ memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info));
/* Fill in memsize later */
params->screen_info.ext_mem_k = 0;
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index bb7e1132290b..f9e5352b3bef 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task)
}
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
regs_user->regs = task_pt_regs(current);
regs_user->abi = perf_reg_abi(current);
@@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task)
return PERF_SAMPLE_REGS_ABI_64;
}
+static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs);
+
void perf_get_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
+ struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs);
struct pt_regs *user_regs = task_pt_regs(current);
+ if (!in_nmi()) {
+ regs_user->regs = user_regs;
+ regs_user->abi = perf_reg_abi(current);
+ return;
+ }
+
/*
* If we're in an NMI that interrupted task_pt_regs setup, then
* we can't sample user regs at all. This check isn't really
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba4593a913fa..145a7ac0c19a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -685,7 +685,7 @@ void arch_cpu_idle(void)
*/
void __cpuidle default_idle(void)
{
- safe_halt();
+ raw_safe_halt();
}
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
@@ -736,6 +736,8 @@ void stop_this_cpu(void *dummy)
/*
* AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
* states (local apic timer and TSC stop).
+ *
+ * XXX this function is completely buggered vs RCU and tracing.
*/
static void amd_e400_idle(void)
{
@@ -757,9 +759,9 @@ static void amd_e400_idle(void)
* The switch back from broadcast mode needs to be called with
* interrupts disabled.
*/
- local_irq_disable();
+ raw_local_irq_disable();
tick_broadcast_exit();
- local_irq_enable();
+ raw_local_irq_enable();
}
/*
@@ -801,9 +803,9 @@ static __cpuidle void mwait_idle(void)
if (!need_resched())
__sti_mwait(0, 0);
else
- local_irq_enable();
+ raw_local_irq_enable();
} else {
- local_irq_enable();
+ raw_local_irq_enable();
}
__current_clr_polling();
}
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
index 5f83ccaab877..7d04b356d44d 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -178,6 +178,32 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
goto fail;
regs->dx = val >> 32;
+ /*
+ * This is a VC handler and the #VC is only raised when SEV-ES is
+ * active, which means SEV must be active too. Do sanity checks on the
+ * CPUID results to make sure the hypervisor does not trick the kernel
+ * into the no-sev path. This could map sensitive data unencrypted and
+ * make it accessible to the hypervisor.
+ *
+ * In particular, check for:
+ * - Hypervisor CPUID bit
+ * - Availability of CPUID leaf 0x8000001f
+ * - SEV CPUID bit.
+ *
+ * The hypervisor might still report the wrong C-bit position, but this
+ * can't be checked here.
+ */
+
+ if ((fn == 1 && !(regs->cx & BIT(31))))
+ /* Hypervisor bit */
+ goto fail;
+ else if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ /* SEV leaf check */
+ goto fail;
+ else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
+ /* SEV bit */
+ goto fail;
+
/* Skip over the CPUID two-byte opcode */
regs->ip += 2;
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 4a96726fbaf8..0bd1a0fc587e 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -374,8 +374,8 @@ fault:
return ES_EXCEPTION;
}
-static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned long vaddr, phys_addr_t *paddr)
+static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned long vaddr, phys_addr_t *paddr)
{
unsigned long va = (unsigned long)vaddr;
unsigned int level;
@@ -394,15 +394,19 @@ static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
if (user_mode(ctxt->regs))
ctxt->fi.error_code |= X86_PF_USER;
- return false;
+ return ES_EXCEPTION;
}
+ if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
+ /* Emulated MMIO to/from encrypted memory not supported */
+ return ES_UNSUPPORTED;
+
pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
pa |= va & ~page_level_mask(level);
*paddr = pa;
- return true;
+ return ES_OK;
}
/* Include code shared with pre-decompression boot stage */
@@ -731,6 +735,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
{
u64 exit_code, exit_info_1, exit_info_2;
unsigned long ghcb_pa = __pa(ghcb);
+ enum es_result res;
phys_addr_t paddr;
void __user *ref;
@@ -740,11 +745,12 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
- if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) {
- if (!read)
+ res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
+ if (res != ES_OK) {
+ if (res == ES_EXCEPTION && !read)
ctxt->fi.error_code |= X86_PF_WRITE;
- return ES_EXCEPTION;
+ return res;
}
exit_info_1 = paddr;
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
new file mode 100644
index 000000000000..ee04941a6546
--- /dev/null
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sev_verify_cbit.S - Code for verification of the C-bit position reported
+ * by the Hypervisor when running with SEV enabled.
+ *
+ * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de)
+ *
+ * sev_verify_cbit() is called before switching to a new long-mode page-table
+ * at boot.
+ *
+ * Verify that the C-bit position is correct by writing a random value to
+ * an encrypted memory location while on the current page-table. Then it
+ * switches to the new page-table to verify the memory content is still the
+ * same. After that it switches back to the current page-table and when the
+ * check succeeded it returns. If the check failed the code invalidates the
+ * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to
+ * make sure no interrupt or exception can get the CPU out of the hlt loop.
+ *
+ * New page-table pointer is expected in %rdi (first parameter)
+ *
+ */
+SYM_FUNC_START(sev_verify_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* First check if a C-bit was detected */
+ movq sme_me_mask(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */
+ movq sev_status(%rip), %rsi
+ testq %rsi, %rsi
+ jz 3f
+
+ /* Save CR4 in %rsi */
+ movq %cr4, %rsi
+
+ /* Disable Global Pages */
+ movq %rsi, %rdx
+ andq $(~X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /*
+ * Verified that running under SEV - now get a random value using
+ * RDRAND. This instruction is mandatory when running as an SEV guest.
+ *
+ * Don't bail out of the loop if RDRAND returns errors. It is better to
+ * prevent forward progress than to work with a non-random value here.
+ */
+1: rdrand %rdx
+ jnc 1b
+
+ /* Store value to memory and keep it in %rdx */
+ movq %rdx, sev_check_data(%rip)
+
+ /* Backup current %cr3 value to restore it later */
+ movq %cr3, %rcx
+
+ /* Switch to new %cr3 - This might unmap the stack */
+ movq %rdi, %cr3
+
+ /*
+ * Compare value in %rdx with memory location. If C-bit is incorrect
+ * this would read the encrypted data and make the check fail.
+ */
+ cmpq %rdx, sev_check_data(%rip)
+
+ /* Restore old %cr3 */
+ movq %rcx, %cr3
+
+ /* Restore previous CR4 */
+ movq %rsi, %cr4
+
+ /* Check CMPQ result */
+ je 3f
+
+ /*
+ * The check failed, prevent any forward progress to prevent ROP
+ * attacks, invalidate the stack and go into a hlt loop.
+ */
+ xorq %rsp, %rsp
+ subq $0x1000, %rsp
+2: hlt
+ jmp 2b
+3:
+#endif
+ /* Return page-table pointer */
+ movq %rdi, %rax
+ ret
+SYM_FUNC_END(sev_verify_cbit)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 992fb1415c0f..ae64f98ec2ab 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,16 +514,10 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (intel_iommu_tboot_noforce)
- return 1;
-
- if (no_iommu || swiotlb || dmar_disabled)
+ if (no_iommu || dmar_disabled)
pr_warn("Forcing Intel-IOMMU to enabled\n");
dmar_disabled = 0;
-#ifdef CONFIG_SWIOTLB
- swiotlb = 0;
-#endif
no_iommu = 0;
return 1;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3c70fb34028b..e19df6cde35d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -793,19 +793,6 @@ static __always_inline unsigned long debug_read_clear_dr6(void)
set_debugreg(DR6_RESERVED, 6);
dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
- /*
- * Clear the virtual DR6 value, ptrace routines will set bits here for
- * things we want signals for.
- */
- current->thread.virtual_dr6 = 0;
-
- /*
- * The SDM says "The processor clears the BTF flag when it
- * generates a debug exception." Clear TIF_BLOCKSTEP to keep
- * TIF_BLOCKSTEP in sync with the hardware BTF flag.
- */
- clear_thread_flag(TIF_BLOCKSTEP);
-
return dr6;
}
@@ -873,6 +860,20 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
*/
WARN_ON_ONCE(user_mode(regs));
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." but PTRACE_BLOCKSTEP requested
+ * it for userspace, but we just took a kernel #DB, so re-set
+ * BTF.
+ */
+ unsigned long debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl |= DEBUGCTLMSR_BTF;
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
+
/*
* Catch SYSENTER with TF set and clear DR_STEP. If this hit a
* watchpoint at the same time then that will still be handled.
@@ -936,6 +937,22 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
instrumentation_begin();
/*
+ * Start the virtual/ptrace DR6 value with just the DR_STEP mask
+ * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits.
+ *
+ * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6)
+ * even if it is not the result of PTRACE_SINGLESTEP.
+ */
+ current->thread.virtual_dr6 = (dr6 & DR_STEP);
+
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+ */
+ clear_thread_flag(TIF_BLOCKSTEP);
+
+ /*
* If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 6a339ce328e0..73f800100066 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -321,19 +321,12 @@ EXPORT_SYMBOL_GPL(unwind_get_return_address);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
{
- struct task_struct *task = state->task;
-
if (unwind_done(state))
return NULL;
if (state->regs)
return &state->regs->ip;
- if (task != current && state->sp == task->thread.sp) {
- struct inactive_task_frame *frame = (void *)task->thread.sp;
- return &frame->ret_addr;
- }
-
if (state->sp)
return (unsigned long *)state->sp - 1;
@@ -663,7 +656,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
} else {
struct inactive_task_frame *frame = (void *)task->thread.sp;
- state->sp = task->thread.sp;
+ state->sp = task->thread.sp + sizeof(*frame);
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
state->signal = (void *)state->ip == ret_from_fork;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 06a278b3701d..83637a2ff605 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -90,6 +90,20 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
return 0;
}
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+
+ /*
+ * save the feature bitmap to avoid cpuid lookup for every PV
+ * operation
+ */
+ if (best)
+ vcpu->arch.pv_cpuid.features = best->eax;
+}
+
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -124,13 +138,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
- /*
- * save the feature bitmap to avoid cpuid lookup for every PV
- * operation
- */
- if (best)
- vcpu->arch.pv_cpuid.features = best->eax;
-
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
if (best)
@@ -162,6 +169,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.guest_supported_xcr0 =
(best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+ kvm_update_pv_runtime(vcpu);
+
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -169,6 +178,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.cr4_guest_rsvd_bits =
__cr4_reserved_bits(guest_cpuid_has, vcpu);
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+
/* Invoke the vendor callback only after the above state is updated. */
kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
}
@@ -672,7 +683,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
edx.split.bit_width_fixed = cap.bit_width_fixed;
- edx.split.reserved = 0;
+ edx.split.anythread_deprecated = 1;
+ edx.split.reserved1 = 0;
+ edx.split.reserved2 = 0;
entry->eax = eax.full;
entry->ebx = cap.events_mask;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index bf8577947ed2..f7a6e8f83783 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -11,6 +11,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
void kvm_set_cpu_caps(void);
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
+void kvm_update_pv_runtime(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0d917eb70319..56cae1ff9e3f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4046,6 +4046,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
static int em_movsxd(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.val = (s32) ctxt->src.val;
@@ -4585,7 +4591,7 @@ static const struct opcode group11[] = {
};
static const struct gprefix pfx_0f_ae_7 = {
- I(SrcMem | ByteOp, em_clflush), N, N, N,
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
};
static const struct group_dual group15 = { {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 99d118ffc67d..814698e5b152 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -40,29 +40,10 @@ static int pending_userspace_extint(struct kvm_vcpu *v)
* check if there is pending interrupt from
* non-APIC source without intack.
*/
-static int kvm_cpu_has_extint(struct kvm_vcpu *v)
-{
- u8 accept = kvm_apic_accept_pic_intr(v);
-
- if (accept) {
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
- return v->kvm->arch.vpic->output;
- } else
- return 0;
-}
-
-/*
- * check if there is injectable interrupt:
- * when virtual interrupt delivery enabled,
- * interrupt from apic will handled by hardware,
- * we don't need to check it here.
- */
-int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
{
/*
- * FIXME: interrupt.injected represents an interrupt that it's
+ * FIXME: interrupt.injected represents an interrupt whose
* side-effects have already been applied (e.g. bit from IRR
* already moved to ISR). Therefore, it is incorrect to rely
* on interrupt.injected to know if there is a pending
@@ -75,6 +56,23 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.injected;
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return v->kvm->arch.vpic->output;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
if (kvm_cpu_has_extint(v))
return 1;
@@ -91,20 +89,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_injectable_intr);
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- /*
- * FIXME: interrupt.injected represents an interrupt that it's
- * side-effects have already been applied (e.g. bit from IRR
- * already moved to ISR). Therefore, it is incorrect to rely
- * on interrupt.injected to know if there is a pending
- * interrupt in the user-mode LAPIC.
- * This leads to nVMX/nSVM not be able to distinguish
- * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
- * pending interrupt or should re-inject an injected
- * interrupt.
- */
- if (!lapic_in_kernel(v))
- return v->arch.interrupt.injected;
-
if (kvm_cpu_has_extint(v))
return 1;
@@ -118,16 +102,21 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
*/
static int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
- if (kvm_cpu_has_extint(v)) {
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
- return kvm_pic_read_irq(v->kvm); /* PIC */
- } else
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
return -1;
+ }
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
}
/*
@@ -135,13 +124,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
*/
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
- int vector;
-
- if (!lapic_in_kernel(v))
- return v->arch.interrupt.nr;
-
- vector = kvm_cpu_get_extint(v);
-
+ int vector = kvm_cpu_get_extint(v);
if (vector != -1)
return vector; /* PIC */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 105e7859d1f2..86c33d53c90a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2465,7 +2465,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (!kvm_apic_hw_enabled(apic))
+ if (!kvm_apic_present(vcpu))
return -1;
__apic_update_ppr(apic, &ppr);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 17587f496ec7..7a6ae9e90bd7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -225,7 +225,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
{
u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
- gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+ gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
& shadow_nonpresent_or_rsvd_mask;
return gpa >> PAGE_SHIFT;
@@ -591,15 +591,15 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
static u64 restore_acc_track_spte(u64 spte)
{
u64 new_spte = spte;
- u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
- & shadow_acc_track_saved_bits_mask;
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
WARN_ON_ONCE(spte_ad_enabled(spte));
WARN_ON_ONCE(!is_access_track_spte(spte));
new_spte &= ~shadow_acc_track_mask;
- new_spte &= ~(shadow_acc_track_saved_bits_mask <<
- shadow_acc_track_saved_bits_shift);
+ new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
new_spte |= saved_bits;
return new_spte;
@@ -856,12 +856,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
} else {
rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
- desc = desc->more;
+ while (desc->sptes[PTE_LIST_EXT-1]) {
count += PTE_LIST_EXT;
- }
- if (desc->sptes[PTE_LIST_EXT-1]) {
- desc->more = mmu_alloc_pte_list_desc(vcpu);
+
+ if (!desc->more) {
+ desc->more = mmu_alloc_pte_list_desc(vcpu);
+ desc = desc->more;
+ break;
+ }
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
@@ -3515,7 +3517,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
u64 sptes[PT64_ROOT_MAX_LEVEL];
struct rsvd_bits_validate *rsvd_check;
- int root = vcpu->arch.mmu->root_level;
+ int root = vcpu->arch.mmu->shadow_root_level;
int leaf;
int level;
bool reserved = false;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index d9c5665a55e9..fcac2cac78fe 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -55,7 +55,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
mask |= shadow_mmio_value | access;
mask |= gpa | shadow_nonpresent_or_rsvd_mask;
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
- << shadow_nonpresent_or_rsvd_mask_len;
+ << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
return mask;
}
@@ -231,12 +231,12 @@ u64 mark_spte_for_access_track(u64 spte)
!spte_can_locklessly_be_made_writable(spte),
"kvm: Writable SPTE is not locklessly dirty-trackable\n");
- WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
- shadow_acc_track_saved_bits_shift),
+ WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
"kvm: Access Tracking saved bit locations are not zero\n");
- spte |= (spte & shadow_acc_track_saved_bits_mask) <<
- shadow_acc_track_saved_bits_shift;
+ spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
spte &= ~shadow_acc_track_mask;
return spte;
@@ -245,7 +245,7 @@ u64 mark_spte_for_access_track(u64 spte)
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
- WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+ WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_access_mask = access_mask;
@@ -306,9 +306,9 @@ void kvm_mmu_reset_all_pte_masks(void)
low_phys_bits = boot_cpu_data.x86_phys_bits;
if (boot_cpu_has_bug(X86_BUG_L1TF) &&
!WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
- 52 - shadow_nonpresent_or_rsvd_mask_len)) {
+ 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
low_phys_bits = boot_cpu_data.x86_cache_bits
- - shadow_nonpresent_or_rsvd_mask_len;
+ - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
shadow_nonpresent_or_rsvd_mask =
rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 4ecf40e0b8fe..5c75a451c000 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -105,19 +105,19 @@ extern u64 __read_mostly shadow_acc_track_mask;
extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
+
+/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the
* PTEs being access tracked also need to be dirty tracked, so the W bit will be
* restored only when a write is attempted to the page.
*/
-static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
- PT64_EPT_EXECUTABLE_MASK;
-static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
-
-/*
- * The number of high-order 1 bits to use in the mask above.
- */
-static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
+ PT64_EPT_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
/*
* In some cases, we need to preserve the GFN of a non-present or reserved
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 27e381c9da6c..ff28a5c6abd6 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -49,7 +49,14 @@ bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
struct kvm_mmu_page *sp;
+ if (!kvm->arch.tdp_mmu_enabled)
+ return false;
+ if (WARN_ON(!VALID_PAGE(hpa)))
+ return false;
+
sp = to_shadow_page(hpa);
+ if (WARN_ON(!sp))
+ return false;
return sp->tdp_mmu_page && sp->root_count;
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c0b14106258a..566f4d18185b 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -642,8 +642,8 @@ static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
* Its safe to read more than we are asked, caller should ensure that
* destination has enough space.
*/
- src_paddr = round_down(src_paddr, 16);
offset = src_paddr & 15;
+ src_paddr = round_down(src_paddr, 16);
sz = round_up(sz + offset, 16);
return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2f32fd09e259..79b3a564f1c9 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1309,8 +1309,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->avic_is_running = true;
svm->msrpm = svm_vcpu_alloc_msrpm();
- if (!svm->msrpm)
+ if (!svm->msrpm) {
+ err = -ENOMEM;
goto error_free_vmcb_page;
+ }
svm_vcpu_init_msrpm(vcpu, svm->msrpm);
@@ -3741,6 +3743,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_cpuid_entry2 *best;
vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
boot_cpu_has(X86_FEATURE_XSAVE) &&
@@ -3753,6 +3756,13 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
/* Check again if INVPCID interception if required */
svm_check_invpcid(svm);
+ /* For sev guests, the memory encryption bit is not reserved in CR3. */
+ if (sev_guest(vcpu->kvm)) {
+ best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
+ if (best)
+ vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f));
+ }
+
if (!kvm_vcpu_apicv_active(vcpu))
return;
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index e5325bd0f304..f3199bb02f22 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -297,14 +297,13 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
-
}
#endif
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index e5f7a7ebf27d..bd41d9462355 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -185,7 +185,7 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1;
}
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
@@ -194,7 +194,6 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; }
static inline u32 evmcs_read32(unsigned long field) { return 0; }
static inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d14c94d0aff1..47b8357b9751 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2560,8 +2560,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
- if (static_branch_unlikely(&enable_evmcs))
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
return 0;
}
@@ -6834,7 +6836,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx;
- unsigned long *msr_bitmap;
int i, cpu, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
@@ -6894,7 +6895,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
- msr_bitmap = vmx->vmcs01.msr_bitmap;
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 397f599b20e5..e545a8a613b1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -255,24 +255,23 @@ static struct kmem_cache *x86_emulator_cache;
/*
* When called, it means the previous get/set msr reached an invalid msr.
- * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want
- * to fail the caller.
+ * Return true if we want to ignore/silent this failed msr access.
*/
-static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
- u64 data, bool write)
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+ u64 data, bool write)
{
const char *op = write ? "wrmsr" : "rdmsr";
if (ignore_msrs) {
if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n",
- op, msr, data);
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+ op, msr, data);
/* Mask the error */
- return 0;
+ return true;
} else {
- vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n",
- op, msr, data);
- return -ENOENT;
+ kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ return false;
}
}
@@ -1042,7 +1041,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu) &&
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
return 1;
else if (is_pae_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1416,7 +1415,8 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
if (r == KVM_MSR_RET_INVALID) {
/* Unconditionally clear the output for simplicity */
*data = 0;
- r = kvm_msr_ignored_check(vcpu, index, 0, false);
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ r = 0;
}
if (r)
@@ -1540,7 +1540,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
struct msr_data msr;
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
- return -EPERM;
+ return KVM_MSR_RET_FILTERED;
switch (index) {
case MSR_FS_BASE:
@@ -1581,7 +1581,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
if (ret == KVM_MSR_RET_INVALID)
- ret = kvm_msr_ignored_check(vcpu, index, data, true);
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
+ ret = 0;
return ret;
}
@@ -1599,7 +1600,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
int ret;
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
- return -EPERM;
+ return KVM_MSR_RET_FILTERED;
msr.index = index;
msr.host_initiated = host_initiated;
@@ -1618,7 +1619,8 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
if (ret == KVM_MSR_RET_INVALID) {
/* Unconditionally clear *data for simplicity */
*data = 0;
- ret = kvm_msr_ignored_check(vcpu, index, 0, false);
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ ret = 0;
}
return ret;
@@ -1662,9 +1664,9 @@ static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
static u64 kvm_msr_reason(int r)
{
switch (r) {
- case -ENOENT:
+ case KVM_MSR_RET_INVALID:
return KVM_MSR_EXIT_REASON_UNKNOWN;
- case -EPERM:
+ case KVM_MSR_RET_FILTERED:
return KVM_MSR_EXIT_REASON_FILTER;
default:
return KVM_MSR_EXIT_REASON_INVAL;
@@ -1965,7 +1967,7 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
struct kvm_arch *ka = &vcpu->kvm->arch;
if (vcpu->vcpu_id == 0 && !host_initiated) {
- if (ka->boot_vcpu_runs_old_kvmclock && old_msr)
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
ka->boot_vcpu_runs_old_kvmclock = old_msr;
@@ -3063,9 +3065,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
- }
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ } else if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
@@ -3463,29 +3465,63 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
msr_info->data = vcpu->arch.apf.msr_en_val;
break;
case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
msr_info->data = vcpu->arch.apf.msr_int_val;
break;
case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
msr_info->data = 0;
break;
case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
case MSR_IA32_P5_MC_ADDR:
@@ -4015,21 +4051,23 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
{
+ /*
+ * We can accept userspace's request for interrupt injection
+ * as long as we have a place to store the interrupt number.
+ * The actual injection will happen when the CPU is able to
+ * deliver the interrupt.
+ */
+ if (kvm_cpu_has_extint(vcpu))
+ return false;
+
+ /* Acknowledging ExtINT does not happen if LINT0 is masked. */
return (!lapic_in_kernel(vcpu) ||
kvm_apic_accept_pic_intr(vcpu));
}
-/*
- * if userspace requested an interrupt window, check that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
{
return kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
- !kvm_event_needs_reinjection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
}
@@ -4575,6 +4613,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
+ if (vcpu->arch.pv_cpuid.enforce)
+ kvm_update_pv_runtime(vcpu);
return 0;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3900ab0c6004..e7ca622a468f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -376,7 +376,13 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
-#define KVM_MSR_RET_INVALID 2
+/*
+ * Internal error codes that are used to indicate that MSR emulation encountered
+ * an error that should result in #GP in the guest, unless userspace
+ * handles it.
+ */
+#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */
+#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */
#define __cr4_reserved_bits(__cpu_has, __c) \
({ \
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 037faac46b0c..1e299ac73c86 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -16,8 +16,6 @@
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
-.weak memcpy
-
/*
* memcpy - Copy a memory block.
*
@@ -30,7 +28,7 @@
* rax original destination
*/
SYM_FUNC_START_ALIAS(__memcpy)
-SYM_FUNC_START_LOCAL(memcpy)
+SYM_FUNC_START_WEAK(memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memcpy_erms", X86_FEATURE_ERMS
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 7ff00ea64e4f..41902fe8b859 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,9 +24,7 @@
* Output:
* rax: dest
*/
-.weak memmove
-
-SYM_FUNC_START_ALIAS(memmove)
+SYM_FUNC_START_WEAK(memmove)
SYM_FUNC_START(__memmove)
mov %rdi, %rax
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9ff15ee404a4..0bfd26e4ca9e 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -6,8 +6,6 @@
#include <asm/alternative-asm.h>
#include <asm/export.h>
-.weak memset
-
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -19,7 +17,7 @@
*
* rax original destination
*/
-SYM_FUNC_START_ALIAS(memset)
+SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
/*
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index efbb3de472df..bc0833713be9 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -39,6 +39,7 @@
*/
u64 sme_me_mask __section(".data") = 0;
u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 44148691d78b..5eb4dc2b97da 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -938,6 +938,7 @@ int phys_to_target_node(phys_addr_t start)
return meminfo_to_nid(&numa_reserved_meminfo, start);
}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
int memory_add_physaddr_to_nid(u64 start)
{
@@ -947,4 +948,5 @@ int memory_add_physaddr_to_nid(u64 start)
nid = numa_meminfo.blk[0].nid;
return nid;
}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8f5759df7776..e1e8d4e3a213 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -78,28 +78,30 @@ int __init efi_alloc_page_tables(void)
gfp_mask = GFP_KERNEL | __GFP_ZERO;
efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
if (!efi_pgd)
- return -ENOMEM;
+ goto fail;
pgd = efi_pgd + pgd_index(EFI_VA_END);
p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
- if (!p4d) {
- free_page((unsigned long)efi_pgd);
- return -ENOMEM;
- }
+ if (!p4d)
+ goto free_pgd;
pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
- if (!pud) {
- if (pgtable_l5_enabled())
- free_page((unsigned long) pgd_page_vaddr(*pgd));
- free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
- return -ENOMEM;
- }
+ if (!pud)
+ goto free_p4d;
efi_mm.pgd = efi_pgd;
mm_init_cpumask(&efi_mm);
init_new_context(NULL, &efi_mm);
return 0;
+
+free_p4d:
+ if (pgtable_l5_enabled())
+ free_page((unsigned long)pgd_page_vaddr(*pgd));
+free_pgd:
+ free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
+fail:
+ return -ENOMEM;
}
/*
diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c
index fdcd58af707a..27361cbb7ca9 100644
--- a/arch/x86/um/stub_segv.c
+++ b/arch/x86/um/stub_segv.c
@@ -8,7 +8,7 @@
#include <sysdep/mcontext.h>
#include <sys/ucontext.h>
-void __section(".__syscall_stub")
+void __attribute__ ((__section__ (".__syscall_stub")))
stub_segv_handler(int sig, siginfo_t *info, void *p)
{
ucontext_t *uc = p;
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 799f4eba0a62..043c73dfd2c9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -93,10 +93,20 @@ void xen_init_lock_cpu(int cpu)
void xen_uninit_lock_cpu(int cpu)
{
+ int irq;
+
if (!xen_pvspin)
return;
- unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
+ /*
+ * When booting the kernel with 'mitigations=auto,nosmt', the secondary
+ * CPUs are not activated, and lock_kicker_irq is not initialized.
+ */
+ irq = per_cpu(lock_kicker_irq, cpu);
+ if (irq == -1)
+ return;
+
+ unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
kfree(per_cpu(irq_name, cpu));
per_cpu(irq_name, cpu) = NULL;