From b40d0156f560932d14e3957579b6508f8d065260 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:49 +0530 Subject: perf/x86/amd/brs: Move feature-specific functions Move some of the Branch Sampling (BRS) specific functions out of the Core events sources and into the BRS sources in preparation for adding other mechanisms to record branches. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b60283b57179475d18ee242d117c335c16733693.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/brs.c | 69 ++++++++++++++++++++++++++++++++++++++++++- arch/x86/events/amd/core.c | 70 ++------------------------------------------ arch/x86/events/perf_event.h | 7 +++-- 3 files changed, 76 insertions(+), 70 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index bee8765a1e9b..f1bff153d945 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -81,7 +81,7 @@ static bool __init amd_brs_detect(void) * a br_sel_map. Software filtering is not supported because it would not correlate well * with a sampling period. */ -int amd_brs_setup_filter(struct perf_event *event) +static int amd_brs_setup_filter(struct perf_event *event) { u64 type = event->attr.branch_sample_type; @@ -96,6 +96,73 @@ int amd_brs_setup_filter(struct perf_event *event) return 0; } +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + +int amd_brs_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. 
+ */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + + return ret; +} + /* tos = top of stack, i.e., last valid entry written */ static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) { diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9ac3718410ce..e32a27899e11 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -330,16 +330,8 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } -#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ -static inline int amd_is_brs_event(struct perf_event *e) -{ - return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; -} - static int amd_core_hw_config(struct perf_event *event) { - int ret = 0; - if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -356,66 +348,10 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - /* - * if branch stack is requested - */ - if (has_branch_stack(event)) { - /* - * Due to interrupt holding, BRS is not recommended in - * counting mode. - */ - if (!is_sampling_event(event)) - return -EINVAL; - - /* - * Due to the way BRS operates by holding the interrupt until - * lbr_nr entries have been captured, it does not make sense - * to allow sampling on BRS with an event that does not match - * what BRS is capturing, i.e., retired taken branches. - * Otherwise the correlation with the event's period is even - * more loose: - * - * With retired taken branch: - * Effective P = P + 16 + X - * With any other event: - * Effective P = P + Y + X - * - * Where X is the number of taken branches due to interrupt - * skid. Skid is large. - * - * Where Y is the occurences of the event while BRS is - * capturing the lbr_nr entries. - * - * By using retired taken branches, we limit the impact on the - * Y variable. We know it cannot be more than the depth of - * BRS. - */ - if (!amd_is_brs_event(event)) - return -EINVAL; + if (has_branch_stack(event)) + return amd_brs_hw_config(event); - /* - * BRS implementation does not work with frequency mode - * reprogramming of the period. - */ - if (event->attr.freq) - return -EINVAL; - /* - * The kernel subtracts BRS depth from period, so it must - * be big enough. 
- */ - if (event->attr.sample_period <= x86_pmu.lbr_nr) - return -EINVAL; - - /* - * Check if we can allow PERF_SAMPLE_BRANCH_STACK - */ - ret = amd_brs_setup_filter(event); - - /* only set in case of success */ - if (!ret) - event->hw.flags |= PERF_X86_EVENT_AMD_BRS; - } - return ret; + return 0; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ba3d24a6a4ec..5deb34e42bbd 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1233,6 +1233,9 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); #ifdef CONFIG_PERF_EVENTS_AMD_BRS + +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ + int amd_brs_init(void); void amd_brs_disable(void); void amd_brs_enable(void); @@ -1241,7 +1244,7 @@ void amd_brs_disable_all(void); void amd_brs_drain(void); void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); -int amd_brs_setup_filter(struct perf_event *event); +int amd_brs_hw_config(struct perf_event *event); void amd_brs_reset(void); static inline void amd_pmu_brs_add(struct perf_event *event) @@ -1277,7 +1280,7 @@ static inline void amd_brs_enable(void) {} static inline void amd_brs_drain(void) {} static inline void amd_brs_lopwr_init(void) {} static inline void amd_brs_disable_all(void) {} -static inline int amd_brs_setup_filter(struct perf_event *event) +static inline int amd_brs_hw_config(struct perf_event *event) { return 0; } -- cgit v1.2.3 From 9603aa79e851c652f6da873205c92213af36b24f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:50 +0530 Subject: perf/x86/amd/core: Refactor branch attributes AMD processors that are capable of recording branches support either Branch Sampling (BRS) or Last Branch Record (LBR). In preparation for adding Last Branch Record Extension Version 2 (LbrExtV2) support, reuse the "branches" capability to advertise information about both BRS and LBR but make the "branch-brs" event exclusive to Family 19h processors that support BRS. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/ba4a4cde6db79b1c65c49834027bbdb8a915546b.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e32a27899e11..2f524cf84528 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1247,23 +1247,25 @@ static ssize_t branches_show(struct device *cdev, static DEVICE_ATTR_RO(branches); -static struct attribute *amd_pmu_brs_attrs[] = { +static struct attribute *amd_pmu_branches_attrs[] = { &dev_attr_branches.attr, NULL, }; static umode_t -amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.lbr_nr ? 
attr->mode : 0; } -static struct attribute_group group_caps_amd_brs = { +static struct attribute_group group_caps_amd_branches = { .name = "caps", - .attrs = amd_pmu_brs_attrs, - .is_visible = amd_brs_is_visible, + .attrs = amd_pmu_branches_attrs, + .is_visible = amd_branches_is_visible, }; +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + EVENT_ATTR_STR(branch-brs, amd_branch_brs, "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); @@ -1272,15 +1274,26 @@ static struct attribute *amd_brs_events_attrs[] = { NULL, }; +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ? + attr->mode : 0; +} + static struct attribute_group group_events_amd_brs = { .name = "events", .attrs = amd_brs_events_attrs, .is_visible = amd_brs_is_visible, }; +#endif /* CONFIG_PERF_EVENTS_AMD_BRS */ + static const struct attribute_group *amd_attr_update[] = { - &group_caps_amd_brs, + &group_caps_amd_branches, +#ifdef CONFIG_PERF_EVENTS_AMD_BRS &group_events_amd_brs, +#endif NULL, }; -- cgit v1.2.3 From 706460a96fc654e80b6bed1f562b00d2ce9f2f4d Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:51 +0530 Subject: perf/x86/amd/core: Add generic branch record interfaces AMD processors that are capable of recording branches support either Branch Sampling (BRS) or Last Branch Record (LBR). In preparation for adding Last Branch Record Extension Version 2 (LbrExtV2) support, introduce new static calls which act as gateways to call into the feature-dependent functions based on what is available on the processor. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b75dbc32663cb395f0d701167e952c6a6b0445a3.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2f524cf84528..ef3520731a20 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -330,6 +330,8 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config); + static int amd_core_hw_config(struct perf_event *event) { if (event->attr.exclude_host && event->attr.exclude_guest) @@ -349,7 +351,7 @@ static int amd_core_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PAIR; if (has_branch_stack(event)) - return amd_brs_hw_config(event); + return static_call(amd_pmu_branch_hw_config)(event); return 0; } @@ -518,8 +520,14 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +typedef void (amd_pmu_branch_reset_t)(void); +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t); + static void amd_pmu_cpu_reset(int cpu) { + if (x86_pmu.lbr_nr) + static_call(amd_pmu_branch_reset)(); + if (x86_pmu.version < 2) return; @@ -576,7 +584,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - amd_brs_reset(); amd_pmu_cpu_reset(cpu); } @@ -771,16 +778,20 @@ static void amd_pmu_v2_disable_all(void) amd_pmu_check_overflow(); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add); + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) - amd_pmu_brs_add(event); + static_call(amd_pmu_branch_add)(event); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del); + static void amd_pmu_del_event(struct perf_event *event) { if 
(needs_branch_stack(event)) - amd_pmu_brs_del(event); + static_call(amd_pmu_branch_del)(event); } /* @@ -1184,13 +1195,6 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static void amd_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - if (sched_in && x86_pmu.lbr_nr) - amd_pmu_brs_sched_task(ctx, sched_in); -} - static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) { /* @@ -1375,8 +1379,14 @@ static int __init amd_core_pmu_init(void) */ if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; - x86_pmu.sched_task = amd_pmu_sched_task; + x86_pmu.sched_task = amd_pmu_brs_sched_task; x86_pmu.limit_period = amd_pmu_limit_period; + + static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config); + static_call_update(amd_pmu_branch_reset, amd_brs_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_brs_add); + static_call_update(amd_pmu_branch_del, amd_pmu_brs_del); + /* * put_event_constraints callback same as Fam17h, set above */ -- cgit v1.2.3 From 257449c6a50298bd21dcabd644f66a0296b78532 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:52 +0530 Subject: x86/cpufeatures: Add LbrExtV2 feature bit CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 1 of EAX indicates support for Last Branch Record Extension Version 2 (LbrExtV2) features. If found to be set during PMU initialization, the EBX bits of the same leaf can be used to determine the number of available LBR entries. For better utilization of feature words, LbrExtV2 is added as a scattered feature bit. [peterz: Rename to AMD_LBR_V2] Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Acked-by: Borislav Petkov Link: https://lore.kernel.org/r/172d2b0df39306ed77221c45ee1aa62e8ae0548d.1660211399.git.sandipan.das@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 235dc85c91c3..52bdd9465f19 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -/* FREE! 
( 3*32+17) */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fd44b54c90d5..fc01f81f6e2a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit v1.2.3 From 703fb765f48897214e3eb110f35dddec80682f60 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:53 +0530 Subject: perf/x86/amd/lbr: Detect LbrExtV2 support AMD Last Branch Record Extension Version 2 (LbrExtV2) is driven by Core PMC overflows. It records recently taken branches up to the moment when the PMC overflow occurs. Detect the feature during PMU initialization and set the branch stack depth using CPUID leaf 0x80000022 EBX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/fc6e45378ada258f1bab79b0de6e05c393a8f1dd.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/core.c | 9 +++++---- arch/x86/events/amd/lbr.c | 21 +++++++++++++++++++++ arch/x86/events/perf_event.h | 2 ++ arch/x86/include/asm/perf_event.h | 3 ++- 5 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 arch/x86/events/amd/lbr.c (limited to 'arch') diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index b9f5d4610256..527d947eb76b 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index ef3520731a20..a3aa67b4fd50 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1374,10 +1374,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } - /* - * BRS requires special event constraints and flushing on ctxsw. - */ - if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + /* LBR and BRS are mutually exclusive features */ + if (amd_pmu_lbr_init() && !amd_brs_init()) { + /* + * BRS requires special event constraints and flushing on ctxsw. 
+ */ x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; x86_pmu.sched_task = amd_pmu_brs_sched_task; x86_pmu.limit_period = amd_pmu_limit_period; diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c new file mode 100644 index 000000000000..4e5b5d35f35a --- /dev/null +++ b/arch/x86/events/amd/lbr.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include "../perf_event.h" + +__init int amd_pmu_lbr_init(void) +{ + union cpuid_0x80000022_ebx ebx; + + if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2)) + return -EOPNOTSUPP; + + /* Set number of entries */ + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz; + + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + + return 0; +} diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5deb34e42bbd..82e8a6d87ade 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1232,6 +1232,8 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); +int amd_pmu_lbr_init(void); + #ifdef CONFIG_PERF_EVENTS_AMD_BRS #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f6fc8dd51ef4..9ac46dbe57d4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -207,7 +207,8 @@ union cpuid_0x80000022_ebx { struct { /* Number of Core Performance Counters */ unsigned int num_core_pmc:4; - unsigned int reserved:6; + /* Number of available LBR Stack Entries */ + unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; } split; -- cgit v1.2.3 From ca5b7c0d9621702e107c83216316a6d722878b64 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:54 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 branch record support If AMD Last Branch Record Extension Version 2 (LbrExtV2) is detected, enable it alongside LBR Freeze on PMI when an event requests branch stack i.e. PERF_SAMPLE_BRANCH_STACK. Each branch record is represented by a pair of registers, LBR From and LBR To. The freeze feature prevents any updates to these registers once a PMC overflows. The contents remain unchanged until the freeze bit is cleared by the PMI handler. The branch records are read and copied to sample data before unfreezing. However, only valid entries are copied. There is no additional register to denote which of the register pairs represent the top of the stack (TOS) since internal register renaming always ensures that the first pair (i.e. index 0) is the one representing the most recent branch and so on. The LBR registers are per-thread resources and are cleared explicitly whenever a new task is scheduled in. There are no special implications on the contents of these registers when transitioning to deep C-states. 
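[Editorial illustration, not part of this patch: a minimal user-space sketch of how the branch records produced by this path could be requested through the existing perf_event_open() interface. The choice of sampling event, period, and branch filter flags below is an assumption made only for the example; any sampling event works with LbrExtV2.]

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;		/* illustrative event choice */
	attr.sample_period = 100000;			/* illustrative period */
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_USER;
	attr.exclude_kernel = 1;			/* sample user space only */

	/* Monitor the calling thread on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* Samples read from the mmap'ed ring buffer now carry branch records */
	close(fd);
	return 0;
}

With PERF_SAMPLE_BRANCH_STACK set, each PERF_RECORD_SAMPLE carries an array of struct perf_branch_entry built from the frozen From/To register pairs, with index 0 being the most recent branch as described above.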
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/d3b8500a3627a0d4d0259b005891ee248f248d91.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 47 ++++++--- arch/x86/events/amd/lbr.c | 203 +++++++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 8 ++ arch/x86/include/asm/msr-index.h | 5 + 4 files changed, 252 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index a3aa67b4fd50..d799628016c8 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -620,7 +620,7 @@ static inline u64 amd_pmu_get_global_status(void) /* PerfCntrGlobalStatus is read-only */ rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); - return status & amd_pmu_global_cntr_mask; + return status; } static inline void amd_pmu_ack_global_status(u64 status) @@ -631,8 +631,6 @@ static inline void amd_pmu_ack_global_status(u64 status) * clears the same bit in PerfCntrGlobalStatus */ - /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ - status &= amd_pmu_global_cntr_mask; wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); } @@ -742,11 +740,17 @@ static void amd_pmu_v2_enable_event(struct perf_event *event) __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } -static void amd_pmu_v2_enable_all(int added) +static __always_inline void amd_pmu_core_enable_all(void) { amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); } +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_lbr_enable_all(); + amd_pmu_core_enable_all(); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -771,10 +775,15 @@ static void amd_pmu_disable_all(void) amd_pmu_check_overflow(); } -static void amd_pmu_v2_disable_all(void) +static __always_inline void amd_pmu_core_disable_all(void) { - /* Disable all PMCs */ amd_pmu_set_global_ctl(0); +} + +static void amd_pmu_v2_disable_all(void) +{ + amd_pmu_core_disable_all(); + amd_pmu_lbr_disable_all(); amd_pmu_check_overflow(); } @@ -877,8 +886,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) pmu_enabled = cpuc->enabled; cpuc->enabled = 0; - /* Stop counting */ - amd_pmu_v2_disable_all(); + /* Stop counting but do not disable LBR */ + amd_pmu_core_disable_all(); status = amd_pmu_get_global_status(); @@ -886,6 +895,12 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!status) goto done; + /* Read branch records before unfreezing */ + if (status & GLOBAL_STATUS_LBRS_FROZEN) { + amd_pmu_lbr_read(); + status &= ~GLOBAL_STATUS_LBRS_FROZEN; + } + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -905,6 +920,9 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -918,7 +936,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) */ WARN_ON(status > 0); - /* Clear overflow bits */ + /* Clear overflow and freeze bits */ amd_pmu_ack_global_status(~status); /* @@ -932,7 +950,7 @@ done: /* Resume counting only if PMU is active */ if (pmu_enabled) - amd_pmu_v2_enable_all(0); + amd_pmu_core_enable_all(); return amd_pmu_adjust_nmi_window(handled); } @@ -1375,7 +1393,14 @@ static int __init amd_core_pmu_init(void) } /* LBR and BRS are mutually exclusive features */ - if (amd_pmu_lbr_init() && !amd_brs_init()) { + if 
(!amd_pmu_lbr_init()) { + /* LBR requires flushing on context switch */ + x86_pmu.sched_task = amd_pmu_lbr_sched_task; + static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config); + static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); + static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); + } else if (!amd_brs_init()) { /* * BRS requires special event constraints and flushing on ctxsw. */ diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 4e5b5d35f35a..1dea66f332ae 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -4,6 +4,209 @@ #include "../perf_event.h" +struct branch_entry { + union { + struct { + u64 ip:58; + u64 ip_sign_ext:5; + u64 mispredict:1; + } split; + u64 full; + } from; + + union { + struct { + u64 ip:58; + u64 ip_sign_ext:3; + u64 reserved:1; + u64 spec:1; + u64 valid:1; + } split; + u64 full; + } to; +}; + +static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); +} + +static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); +} + +static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + + return val; +} + +static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + + return val; +} + +static __always_inline u64 sign_ext_branch_ip(u64 ip) +{ + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + return (u64)(((s64)ip << shift) >> shift); +} + +void amd_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_branch_entry *br = cpuc->lbr_entries; + struct branch_entry entry; + int out = 0, i; + + if (!cpuc->lbr_users) + return; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + entry.from.full = amd_pmu_lbr_get_from(i); + entry.to.full = amd_pmu_lbr_get_to(i); + + /* Check if a branch has been logged */ + if (!entry.to.split.valid) + continue; + + perf_clear_branch_entry_bitfields(br + out); + + br[out].from = sign_ext_branch_ip(entry.from.split.ip); + br[out].to = sign_ext_branch_ip(entry.to.split.ip); + br[out].mispred = entry.from.split.mispredict; + br[out].predicted = !br[out].mispred; + out++; + } + + cpuc->lbr_stack.nr = out; + + /* + * Internal register renaming always ensures that LBR From[0] and + * LBR To[0] always represent the TOS + */ + cpuc->lbr_stack.hw_idx = 0; +} + +static int amd_pmu_lbr_setup_filter(struct perf_event *event) +{ + /* No LBR support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + return 0; +} + +int amd_pmu_lbr_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* LBR is not recommended in counting mode */ + if (!is_sampling_event(event)) + return -EINVAL; + + ret = amd_pmu_lbr_setup_filter(event); + if (!ret) + event->attach_state |= PERF_ATTACH_SCHED_CB; + + return ret; +} + +void amd_pmu_lbr_reset(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + if (!x86_pmu.lbr_nr) + return; + + /* Reset all branch records individually */ + for (i = 0; i < x86_pmu.lbr_nr; i++) { + amd_pmu_lbr_set_from(i, 0); + amd_pmu_lbr_set_to(i, 0); + } + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; +} + +void amd_pmu_lbr_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + 
perf_sched_cb_inc(event->ctx->pmu); + + if (!cpuc->lbr_users++ && !event->total_time_running) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * A context switch can flip the address space and LBR entries are + * not tagged with an identifier. Hence, branches cannot be resolved + * from the old address space and the LBR records should be wiped. + */ + if (cpuc->lbr_users && sched_in) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); +} + +void amd_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); +} + __init int amd_pmu_lbr_init(void) { union cpuid_0x80000022_ebx ebx; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 82e8a6d87ade..e8930414cba5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1233,6 +1233,14 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); int amd_pmu_lbr_init(void); +void amd_pmu_lbr_reset(void); +void amd_pmu_lbr_read(void); +void amd_pmu_lbr_add(struct perf_event *event); +void amd_pmu_lbr_del(struct perf_event *event); +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_enable_all(void); +void amd_pmu_lbr_disable_all(void); +int amd_pmu_lbr_hw_config(struct perf_event *event); #ifdef CONFIG_PERF_EVENTS_AMD_BRS diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6674bdb096f3..109c404e6137 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -585,6 +585,9 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Last Branch Record MSRs */ +#define MSR_AMD64_LBR_SELECT 0xc000010e + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -756,6 +759,8 @@ #define MSR_AMD_DBG_EXTN_CFG 0xc000010f #define MSR_AMD_SAMP_BR_FROM 0xc0010300 +#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6) + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 -- cgit v1.2.3 From f4f925dae7419fc7a10af539c073871927ce3a24 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:55 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 hardware branch filter support If AMD Last Branch Record Extension Version 2 (LbrExtV2) is detected, convert the requested branch filter (PERF_SAMPLE_BRANCH_* flags) to the corresponding hardware filter value and stash it in the event data when a branch stack is requested. 
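[Editorial illustration, not part of this patch: a standalone worked example of the conversion performed by amd_pmu_lbr_setup_filter() below, for a request of PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY_CALL. The bit values are restated from the LBR_SELECT_* definitions added by this patch so the snippet builds on its own; the suppress-mode behaviour of the register is described in the next paragraph.]

#include <stdio.h>

/* Restated from the LBR Branch Select definitions added in this patch */
#define LBR_SELECT_MASK	0x1ff
#define LBR_USER	(1 << 1)	/* branches ending in CPL > 0 */
#define LBR_REL_CALL	(1 << 3)	/* near relative calls */
#define LBR_IND_CALL	(1 << 4)	/* near indirect calls */

int main(void)
{
	/*
	 * lbr_select_map maps PERF_SAMPLE_BRANCH_USER to LBR_USER and
	 * PERF_SAMPLE_BRANCH_ANY_CALL to LBR_REL_CALL | LBR_IND_CALL
	 */
	unsigned int mask = LBR_USER | LBR_REL_CALL | LBR_IND_CALL;	/* 0x01a */

	/* The filter bits operate in suppress mode, hence the XOR */
	printf("LBR select value: %#x\n", mask ^ LBR_SELECT_MASK);	/* 0x1e5 */
	return 0;
}

The resulting value 0x1e5 leaves only the requested bits clear, i.e. every branch type other than user-level near relative and indirect calls is suppressed by hardware.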
The hardware filter value is also saved in per-CPU areas for use during event scheduling. Hardware filtering is provided by the LBR Branch Select register. It has bits which when set, suppress recording of the following types of branches: * CPL = 0 (Kernel only) * CPL > 0 (Userspace only) * Conditional Branches * Near Relative Calls * Near Indirect Calls * Near Returns * Near Indirect Jumps (excluding Near Indirect Calls and Near Returns) * Near Relative Jumps (excluding Near Relative Calls) * Far Branches Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/9336af5c9785b8e14c62220fc0e6cfb10ab97de3.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 21 ++++++++--- arch/x86/events/amd/lbr.c | 94 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d799628016c8..36bede1d7b1e 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -542,16 +542,24 @@ static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL, + cpu_to_node(cpu)); + if (!cpuc->lbr_sel) + return -ENOMEM; + WARN_ON_ONCE(cpuc->amd_nb); if (!x86_pmu.amd_nb_constraints) return 0; cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return -ENOMEM; + if (cpuc->amd_nb) + return 0; - return 0; + kfree(cpuc->lbr_sel); + cpuc->lbr_sel = NULL; + + return -ENOMEM; } static void amd_pmu_cpu_starting(int cpu) @@ -589,13 +597,14 @@ static void amd_pmu_cpu_starting(int cpu) static void amd_pmu_cpu_dead(int cpu) { - struct cpu_hw_events *cpuhw; + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + kfree(cpuhw->lbr_sel); + cpuhw->lbr_sel = NULL; if (!x86_pmu.amd_nb_constraints) return; - cpuhw = &per_cpu(cpu_hw_events, cpu); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 1dea66f332ae..bb79b43b7cd8 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -4,6 +4,39 @@ #include "../perf_event.h" +/* LBR Branch Select valid bits */ +#define LBR_SELECT_MASK 0x1ff + +/* + * LBR Branch Select filter bits which when set, ensures that the + * corresponding type of branches are not recorded + */ +#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */ +#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */ +#define LBR_SELECT_JCC 2 /* Conditional branches */ +#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */ +#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */ +#define LBR_SELECT_RET_NEAR 5 /* Near returns */ +#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */ +#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. 
calls) */ +#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */ + +#define LBR_KERNEL BIT(LBR_SELECT_KERNEL) +#define LBR_USER BIT(LBR_SELECT_USER) +#define LBR_JCC BIT(LBR_SELECT_JCC) +#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL) +#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND) +#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR) +#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL) +#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND) +#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH) +#define LBR_NOT_SUPP -1 /* unsupported filter */ +#define LBR_IGNORE 0 + +#define LBR_ANY \ + (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \ + LBR_REL_JMP | LBR_IND_JMP | LBR_FAR) + struct branch_entry { union { struct { @@ -97,12 +130,56 @@ void amd_pmu_lbr_read(void) cpuc->lbr_stack.hw_idx = 0; } +static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, + + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, + + [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, + + [PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT] = LBR_NOT_SUPP, +}; + static int amd_pmu_lbr_setup_filter(struct perf_event *event) { + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + /* No LBR support */ if (!x86_pmu.lbr_nr) return -EOPNOTSUPP; + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & BIT_ULL(i))) + continue; + + v = lbr_select_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGNORE) + mask |= v; + } + + /* Filter bits operate in suppress mode */ + reg->config = mask ^ LBR_SELECT_MASK; + return 0; } @@ -137,6 +214,7 @@ void amd_pmu_lbr_reset(void) cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; + wrmsrl(MSR_AMD64_LBR_SELECT, 0); } void amd_pmu_lbr_add(struct perf_event *event) @@ -146,6 +224,11 @@ void amd_pmu_lbr_add(struct perf_event *event) if (!x86_pmu.lbr_nr) return; + if (has_branch_stack(event)) { + cpuc->lbr_select = 1; + cpuc->lbr_sel->config = event->hw.branch_reg.config; + } + perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) @@ -159,6 +242,9 @@ void amd_pmu_lbr_del(struct perf_event *event) if (!x86_pmu.lbr_nr) return; + if (has_branch_stack(event)) + cpuc->lbr_select = 0; + cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); perf_sched_cb_dec(event->ctx->pmu); @@ -180,11 +266,17 @@ void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) void amd_pmu_lbr_enable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 dbg_ctl, dbg_extn_cfg; + u64 lbr_select, dbg_ctl, dbg_extn_cfg; if (!cpuc->lbr_users || !x86_pmu.lbr_nr) return; + /* Set hardware branch filter */ + if (cpuc->lbr_select) { + lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; + wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); + } + 
rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); -- cgit v1.2.3 From 4462fbfe6ec1bfe2196b977010f6ce7b43a32f2c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:56 +0530 Subject: perf/x86: Move branch classifier Commit 3e702ff6d1ea ("perf/x86: Add LBR software filter support for Intel CPUs") introduces a software branch filter which complements the hardware branch filter and adds an x86 branch classifier. Move the branch classifier to arch/x86/events/ so that it can be utilized by other vendors for branch record filtering. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/bae5b95470d6bd49f40954bd379f414f5afcb965.1660211399.git.sandipan.das@amd.com --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/lbr.c | 273 ------------------------------------------- arch/x86/events/perf_event.h | 62 ++++++++++ arch/x86/events/utils.c | 216 ++++++++++++++++++++++++++++++++++ 4 files changed, 279 insertions(+), 274 deletions(-) create mode 100644 arch/x86/events/utils.c (limited to 'arch') diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9933c0e8e97a..86a76efa8bb6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += core.o probe.o +obj-y += core.o probe.o utils.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4f70fb6c2c1e..7dffc0c731a6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -4,7 +4,6 @@ #include #include -#include #include "../perf_event.h" @@ -65,65 +64,6 @@ #define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59)) -/* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults - */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ - - X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ - -}; - -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) - /* * Intel LBR_CTL bits * @@ -1143,219 +1083,6 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) return ret; } 
-/* - * return the type of control flow change at address "from" - * instruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindly read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); -#endif - insn_init(&insn, addr, bytes_read, is64); - if (insn_get_opcode(&insn)) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - fallthrough; - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 
0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - if (insn_get_modrm(&insn)) - return X86_BR_ABORT; - - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. - */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -#define X86_BR_TYPE_MAP_MAX 16 - -static int branch_map[X86_BR_TYPE_MAP_MAX] = { - PERF_BR_CALL, /* X86_BR_CALL */ - PERF_BR_RET, /* X86_BR_RET */ - PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ - PERF_BR_SYSRET, /* X86_BR_SYSRET */ - PERF_BR_UNKNOWN, /* X86_BR_INT */ - PERF_BR_ERET, /* X86_BR_IRET */ - PERF_BR_COND, /* X86_BR_JCC */ - PERF_BR_UNCOND, /* X86_BR_JMP */ - PERF_BR_IRQ, /* X86_BR_IRQ */ - PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_ABORT */ - PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ - PERF_BR_CALL, /* X86_BR_ZERO_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ - PERF_BR_IND, /* X86_BR_IND_JMP */ -}; - -static int -common_branch_type(int type) -{ - int i; - - type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ - - if (type) { - i = __ffs(type); - if (i < X86_BR_TYPE_MAP_MAX) - return branch_map[i]; - } - - return PERF_BR_UNKNOWN; -} - enum { ARCH_LBR_BR_TYPE_JCC = 0, ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e8930414cba5..2de9cd66347e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1210,6 +1210,68 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + +}; + +#define 
X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ + X86_BR_ZERO_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +int common_branch_type(int type); +int branch_type(unsigned long from, unsigned long to, int abort); + ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c new file mode 100644 index 000000000000..a32368945462 --- /dev/null +++ b/arch/x86/events/utils.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "perf_event.h" + +/* + * return the type of control flow change at address "from" + * instruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes_read, bytes_left; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindly read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) { + addr = (void *)from; + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. 
+ */ + bytes_read = MAX_INSN_SIZE; + } else { + return X86_BR_NONE; + } + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); +#endif + insn_init(&insn, addr, bytes_read, is64); + if (insn_get_opcode(&insn)) + return X86_BR_ABORT; + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } + fallthrough; + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(&insn)) + return X86_BR_ABORT; + + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_IND_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. 
+ */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_ERET, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_IRQ, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +int common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} -- cgit v1.2.3 From f9c732249b110fae9ebf4ce33db4cb3a12c6eae3 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:57 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 software branch filter support With AMD Last Branch Record Extension Version 2 (LbrExtV2), it is necessary to process the branch records further as hardware filtering is not granular enough for identifying certain types of branches. E.g. to record system calls, one should record far branches. The filter captures both far calls and far returns but the irrelevant records are filtered out based on the branch type as seen by the branch classifier. 
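[Editorial illustration, not part of this patch: a standalone sketch of the discard test used by amd_pmu_lbr_filter() below for the far-branch case described above. The X86_BR_* bit values are restated from the classifier definitions moved into arch/x86/events/perf_event.h earlier in this series.]

#include <stdio.h>

/* Restated from the X86_BR_* definitions in arch/x86/events/perf_event.h */
#define X86_BR_KERNEL	(1 << 1)
#define X86_BR_CALL	(1 << 2)
#define X86_BR_RET	(1 << 3)
#define X86_BR_SYSRET	(1 << 5)
#define X86_BR_IRET	(1 << 7)

int main(void)
{
	/*
	 * Software mask built by amd_pmu_lbr_setup_filter() for
	 * PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_ANY_RETURN
	 */
	int br_sel = X86_BR_KERNEL | X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;

	/*
	 * A kernel-mode far call: captured by the LBR_FAR hardware filter,
	 * but classified as a call by branch_type()
	 */
	int type = X86_BR_CALL | X86_BR_KERNEL;

	/* Same test amd_pmu_lbr_filter() uses to drop an entry */
	printf("%s\n", (br_sel & type) != type ? "discarded" : "kept");
	return 0;
}

The far call is discarded even though the hardware filter had to capture it, while far returns (classified as X86_BR_RET, X86_BR_IRET or X86_BR_SYSRET) pass the test, which is what PERF_SAMPLE_BRANCH_ANY_RETURN asked for.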
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/e51de057517f77788abd393c832e8dea616d489c.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 92 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index bb79b43b7cd8..1a8d27e0c145 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -94,6 +94,50 @@ static __always_inline u64 sign_ext_branch_ip(u64 ip) return (u64)(((s64)ip << shift) >> shift); } +static void amd_pmu_lbr_filter(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int br_sel = cpuc->br_sel, type, i, j; + bool compress = false; + u64 from, to; + + /* If sampling all branches, there is nothing to filter */ + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + type = branch_type(from, to, 0); + + /* If type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; /* mark invalid */ + compress = true; + } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); + } + + if (!compress) + return; + + /* Remove all invalid entries */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + void amd_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -128,6 +172,9 @@ void amd_pmu_lbr_read(void) * LBR To[0] always represent the TOS */ cpuc->lbr_stack.hw_idx = 0; + + /* Perform further software filtering */ + amd_pmu_lbr_filter(); } static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -136,8 +183,8 @@ static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, @@ -150,8 +197,6 @@ static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, - - [PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT] = LBR_NOT_SUPP, }; static int amd_pmu_lbr_setup_filter(struct perf_event *event) @@ -165,6 +210,41 @@ static int amd_pmu_lbr_setup_filter(struct perf_event *event) if (!x86_pmu.lbr_nr) return -EOPNOTSUPP; + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* Ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & 
PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + + reg->reg = mask; + mask = 0; + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { if (!(br_type & BIT_ULL(i))) continue; @@ -220,13 +300,15 @@ void amd_pmu_lbr_reset(void) void amd_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event_extra *reg = &event->hw.branch_reg; if (!x86_pmu.lbr_nr) return; if (has_branch_stack(event)) { cpuc->lbr_select = 1; - cpuc->lbr_sel->config = event->hw.branch_reg.config; + cpuc->lbr_sel->config = reg->config; + cpuc->br_sel = reg->reg; } perf_sched_cb_inc(event->ctx->pmu); -- cgit v1.2.3 From df3e9612f758fb5f9c251cbe262e3c68ffe67b2c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:58 +0530 Subject: perf/x86: Make branch classifier fusion-aware With branch fusion and other optimizations, branch sampling hardware in some processors can report a branch from address that points to an instruction preceding the actual branch by several bytes. In such cases, the classifier cannot determine the branch type which leads to failures such as with the recently added test from commit b55878c90ab9 ("perf test: Add test for branch stack sampling"). Branch information is also easier to consume and annotate if branch from addresses always point to branch instructions. Add a new variant of the branch classifier that can account for instruction fusion. If fusion is expected and the current branch from address does not point to a branch instruction, it attempts to find the first branch within the next (MAX_INSN_SIZE - 1) bytes and if found, additionally provides the offset between the reported branch from address and the address of the expected branch instruction. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b6bb0abaa8a54c0b6d716344700ee11a1793d709.1660211399.git.sandipan.das@amd.com --- arch/x86/events/perf_event.h | 2 + arch/x86/events/utils.c | 169 +++++++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 69 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2de9cd66347e..93263b98cd6e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1271,6 +1271,8 @@ enum { int common_branch_type(int type); int branch_type(unsigned long from, unsigned long to, int abort); +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset); ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index a32368945462..e013243f360c 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -3,6 +3,68 @@ #include "perf_event.h" +static int decode_branch_type(struct insn *insn) +{ + int ext; + + if (insn_get_opcode(insn)) + return X86_BR_ABORT; + + switch (insn->opcode.bytes[0]) { + case 0xf: + switch (insn->opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + return X86_BR_SYSCALL; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + return X86_BR_SYSRET; + case 0x80 ... 
0x8f: /* conditional */ + return X86_BR_JCC; + } + return X86_BR_NONE; + case 0x70 ... 0x7f: /* conditional */ + return X86_BR_JCC; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + return X86_BR_RET; + case 0xcf: /* iret */ + return X86_BR_IRET; + case 0xcc ... 0xce: /* int */ + return X86_BR_INT; + case 0xe8: /* call near rel */ + if (insn_get_immediate(insn) || insn->immediate1.value == 0) { + /* zero length call */ + return X86_BR_ZERO_CALL; + } + fallthrough; + case 0x9a: /* call far absolute */ + return X86_BR_CALL; + case 0xe0 ... 0xe3: /* loop jmp */ + return X86_BR_JCC; + case 0xe9 ... 0xeb: /* jmp */ + return X86_BR_JMP; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(insn)) + return X86_BR_ABORT; + + ext = (insn->modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + return X86_BR_IND_CALL; + case 4: + case 5: + return X86_BR_IND_JMP; + } + return X86_BR_NONE; + } + + return X86_BR_NONE; +} + /* * return the type of control flow change at address "from" * instruction is not necessarily a branch (in case of interrupt). @@ -13,14 +75,22 @@ * If a branch type is unknown OR the instruction cannot be * decoded (e.g., text page not present), then X86_BR_NONE is * returned. + * + * While recording branches, some processors can report the "from" + * address to be that of an instruction preceding the actual branch + * when instruction fusion occurs. If fusion is expected, attempt to + * find the type of the first branch instruction within the next + * MAX_INSN_SIZE bytes and if found, provide the offset between the + * reported "from" address and the actual branch instruction address. */ -int branch_type(unsigned long from, unsigned long to, int abort) +static int get_branch_type(unsigned long from, unsigned long to, int abort, + bool fused, int *offset) { struct insn insn; void *addr; - int bytes_read, bytes_left; + int bytes_read, bytes_left, insn_offset; int ret = X86_BR_NONE; - int ext, to_plm, from_plm; + int to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; int is64 = 0; @@ -83,77 +153,27 @@ int branch_type(unsigned long from, unsigned long to, int abort) is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); #endif insn_init(&insn, addr, bytes_read, is64); - if (insn_get_opcode(&insn)) - return X86_BR_ABORT; + ret = decode_branch_type(&insn); + insn_offset = 0; - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 
0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; + /* Check for the possibility of branch fusion */ + while (fused && ret == X86_BR_NONE) { + /* Check for decoding errors */ + if (insn_get_length(&insn) || !insn.length) break; - } - fallthrough; - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - if (insn_get_modrm(&insn)) - return X86_BR_ABORT; - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; + insn_offset += insn.length; + bytes_read -= insn.length; + if (bytes_read < 0) break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; + + insn_init(&insn, addr + insn_offset, bytes_read, is64); + ret = decode_branch_type(&insn); } + + if (offset) + *offset = insn_offset; + /* * interrupts, traps, faults (and thus ring transition) may * occur on any instructions. Thus, to classify them correctly, @@ -179,6 +199,17 @@ int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +int branch_type(unsigned long from, unsigned long to, int abort) +{ + return get_branch_type(from, to, abort, false, NULL); +} + +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset) +{ + return get_branch_type(from, to, abort, true, offset); +} + #define X86_BR_TYPE_MAP_MAX 16 static int branch_map[X86_BR_TYPE_MAP_MAX] = { -- cgit v1.2.3 From 245268c19f701c7222dedcb6a383bc73d63925d4 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:59 +0530 Subject: perf/x86/amd/lbr: Use fusion-aware branch classifier AMD Last Branch Record Extension Version 2 (LbrExtV2) can report a branch from address that points to an instruction preceding the actual branch by several bytes due to branch fusion and further optimizations in Zen4 processors. In such cases, software should move forward sequentially in the instruction stream from the reported address and the address of the first branch encountered should be used instead. Hence, use the fusion-aware branch classifier to determine the correct branch type and get the offset for adjusting the branch from address. 
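A minimal sketch of that adjustment, assuming branch_type_fused() reports the
byte distance from the reported address to the first branch instruction it
finds:

        int offset = 0;
        int type = branch_type_fused(from, to, 0, &offset);

        /* point "from" at the branch itself rather than the fused predecessor */
        if (offset)
                from += offset;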
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/c324d2d0a9c3976da30b9563d09e50bfee0f264d.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 1a8d27e0c145..eb84f196b2ca 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -97,7 +97,7 @@ static __always_inline u64 sign_ext_branch_ip(u64 ip) static void amd_pmu_lbr_filter(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int br_sel = cpuc->br_sel, type, i, j; + int br_sel = cpuc->br_sel, offset, type, i, j; bool compress = false; u64 from, to; @@ -109,7 +109,15 @@ static void amd_pmu_lbr_filter(void) for (i = 0; i < cpuc->lbr_stack.nr; i++) { from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; - type = branch_type(from, to, 0); + type = branch_type_fused(from, to, 0, &offset); + + /* + * Adjust the branch from address in case of instruction + * fusion where it points to an instruction preceding the + * actual branch + */ + if (offset) + cpuc->lbr_entries[i].from += offset; /* If type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { -- cgit v1.2.3 From 0bc3be5b4bfd5b75086c26d63584a6f7aaea87d5 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:01 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 branch speculation info support Provide branch speculation information captured via AMD Last Branch Record Extension Version 2 (LbrExtV2) by setting the speculation info in branch records. The info is based on the "valid" and "spec" bits in the Branch To registers. Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/ddc02f6320464cad0e3ff5bdb2314531568a91bc.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index eb84f196b2ca..2e1c1573efe7 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -146,12 +146,19 @@ static void amd_pmu_lbr_filter(void) } } +static const int lbr_spec_map[PERF_BR_SPEC_MAX] = { + PERF_BR_SPEC_NA, + PERF_BR_SPEC_WRONG_PATH, + PERF_BR_NON_SPEC_CORRECT_PATH, + PERF_BR_SPEC_CORRECT_PATH, +}; + void amd_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_branch_entry *br = cpuc->lbr_entries; struct branch_entry entry; - int out = 0, i; + int out = 0, idx, i; if (!cpuc->lbr_users) return; @@ -160,8 +167,11 @@ void amd_pmu_lbr_read(void) entry.from.full = amd_pmu_lbr_get_from(i); entry.to.full = amd_pmu_lbr_get_to(i); - /* Check if a branch has been logged */ - if (!entry.to.split.valid) + /* + * Check if a branch has been logged; if valid = 0, spec = 0 + * then no branch was recorded + */ + if (!entry.to.split.valid && !entry.to.split.spec) continue; perf_clear_branch_entry_bitfields(br + out); @@ -170,6 +180,25 @@ void amd_pmu_lbr_read(void) br[out].to = sign_ext_branch_ip(entry.to.split.ip); br[out].mispred = entry.from.split.mispredict; br[out].predicted = !br[out].mispred; + + /* + * Set branch speculation information using the status of + * the valid and spec bits. + * + * When valid = 0, spec = 0, no branch was recorded and the + * entry is discarded as seen above. 
+ * + * When valid = 0, spec = 1, the recorded branch was + * speculative but took the wrong path. + * + * When valid = 1, spec = 0, the recorded branch was + * non-speculative but took the correct path. + * + * When valid = 1, spec = 1, the recorded branch was + * speculative and took the correct path + */ + idx = (entry.to.split.valid << 1) | entry.to.split.spec; + br[out].spec = lbr_spec_map[idx]; out++; } -- cgit v1.2.3 From a724ec82966d57e4b5d36341d3e3dc1a3c011564 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:15 +0530 Subject: perf: Add system error and not in transaction branch types This expands the generic branch type classification by adding two more entries, i.e. system error and not in transaction. This also updates the x86 implementation to process X86_BR_NO_TX records as appropriate. This changes the branch types reported to user space on the x86 platform, but it should not be a problem. The possible scenarios and impacts are enumerated here.
--------------------------------------------------------------------------
| kernel | perf tool |                       Impact                      |
--------------------------------------------------------------------------
|  old   |    old    | Works as before                                   |
--------------------------------------------------------------------------
|  old   |    new    | PERF_BR_UNKNOWN is processed                      |
--------------------------------------------------------------------------
|  new   |    old    | PERF_BR_NO_TX is blocked via old PERF_BR_MAX      |
--------------------------------------------------------------------------
|  new   |    new    | PERF_BR_NO_TX is recognized                       |
--------------------------------------------------------------------------
When PERF_BR_NO_TX is blocked via the old PERF_BR_MAX (new kernel with old perf tool), user space might throw up a warning complaining about unrecognized branch types being reported, but this is expected. The PERF_BR_SERROR and PERF_BR_NO_TX branch types will be used for the BRBE implementation on the arm64 platform. PERF_BR_NO_TX complements the 'abort' and 'in_tx' elements in perf_branch_entry, which represent the other transaction states for a given branch record, thereby completing the transaction state classification.
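As a hedged illustration of the "blocked via old PERF_BR_MAX" row above, an
older tool typically guards its lookup with the PERF_BR_MAX value it was built
against (branch_type_name() and names[] are hypothetical tool-side helpers,
not kernel code):

        static const char *branch_type_name(unsigned int type)
        {
                if (type >= PERF_BR_MAX)        /* old PERF_BR_MAX, i.e. 13 */
                        return "unknown";
                return names[type];             /* table indexed by PERF_BR_* */
        }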
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-2-anshuman.khandual@arm.com --- arch/x86/events/utils.c | 2 +- include/uapi/linux/perf_event.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index e013243f360c..5f5617afde79 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -225,7 +225,7 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = { PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ PERF_BR_UNKNOWN, /* X86_BR_ABORT */ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_NO_TX, /* X86_BR_NO_TX */ PERF_BR_CALL, /* X86_BR_ZERO_CALL */ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ PERF_BR_IND, /* X86_BR_IND_JMP */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 30a4723aefd4..a79cc0eb4de7 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -253,6 +253,8 @@ enum { PERF_BR_COND_RET = 10, /* conditional function return */ PERF_BR_ERET = 11, /* exception return */ PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ PERF_BR_MAX, }; -- cgit v1.2.3 From be3f152568cc7f5f573d21d5f86a2c4f3cc047ab Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:11 +0200 Subject: perf/hw_breakpoint: Optimize constant number of breakpoint slots Optimize internal hw_breakpoint state if the architecture's number of breakpoint slots is constant. This avoids several kmalloc() calls and potentially unnecessary failures if the allocations fail, as well as subtly improves code generation and cache locality. The protocol is that if an architecture defines hw_breakpoint_slots via the preprocessor, it must be constant and the same for all types. 
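A condensed sketch of the two sides of that protocol, mirroring the hunks
below: the architecture exposes the constant as a preprocessor macro, and the
generic code keys off the macro being defined:

        /* arch header: constant slot count, same for all types */
        #define hw_breakpoint_slots(type)       (HBP_NUM)

        /* generic code (inside struct bp_cpuinfo): fixed array when constant,
         * kmalloc()'d per possible CPU otherwise
         */
        #ifdef hw_breakpoint_slots
                unsigned int tsk_pinned[hw_breakpoint_slots(0)];
        #else
                unsigned int *tsk_pinned;
        #endif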
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-7-elver@google.com --- arch/sh/include/asm/hw_breakpoint.h | 5 +- arch/x86/include/asm/hw_breakpoint.h | 5 +- kernel/events/hw_breakpoint.c | 94 +++++++++++++++++++++++------------- 3 files changed, 63 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 199d17b765f2..361a0f57bdeb 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -48,10 +48,7 @@ struct pmu; /* Maximum number of UBC channels */ #define HBP_NUM 2 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) /* arch/sh/kernel/hw_breakpoint.c */ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index a1f0e90d0818..0bc931cd0698 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -44,10 +44,7 @@ struct arch_hw_breakpoint { /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) struct perf_event_attr; struct perf_event; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 7df46b276452..9fb66d358d81 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -40,13 +40,16 @@ struct bp_cpuinfo { /* Number of pinned cpu breakpoints in a cpu */ unsigned int cpu_pinned; /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ +#ifdef hw_breakpoint_slots + unsigned int tsk_pinned[hw_breakpoint_slots(0)]; +#else unsigned int *tsk_pinned; +#endif /* Number of non-pinned cpu/task breakpoints in a cpu */ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX] __ro_after_init; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { @@ -73,6 +76,54 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +#ifdef hw_breakpoint_slots +/* + * Number of breakpoint slots is constant, and the same for all types. + */ +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } +static inline int init_breakpoint_slots(void) { return 0; } +#else +/* + * Dynamic number of breakpoint slots. 
+ */ +static int __nr_bp_slots[TYPE_MAX] __ro_after_init; + +static inline int hw_breakpoint_slots_cached(int type) +{ + return __nr_bp_slots[type]; +} + +static __init int init_breakpoint_slots(void) +{ + int i, cpu, err_cpu; + + for (i = 0; i < TYPE_MAX; i++) + __nr_bp_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL); + if (!info->tsk_pinned) + goto err; + } + } + + return 0; +err: + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + kfree(get_bp_info(err_cpu, i)->tsk_pinned); + if (err_cpu == cpu) + break; + } + + return -ENOMEM; +} +#endif + __weak int hw_breakpoint_weight(struct perf_event *bp) { return 1; @@ -95,7 +146,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; - for (i = nr_slots[type] - 1; i >= 0; i--) { + for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -312,7 +363,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + if (slots.pinned + (!!slots.flexible) > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); @@ -632,7 +683,7 @@ bool hw_breakpoint_is_used(void) if (info->cpu_pinned) return true; - for (int slot = 0; slot < nr_slots[type]; ++slot) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { if (info->tsk_pinned[slot]) return true; } @@ -716,42 +767,19 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - int cpu, err_cpu; - int i, ret; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); - - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - struct bp_cpuinfo *info = get_bp_info(cpu, i); - - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), - GFP_KERNEL); - if (!info->tsk_pinned) { - ret = -ENOMEM; - goto err; - } - } - } + int ret; ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); if (ret) - goto err; + return ret; + + ret = init_breakpoint_slots(); + if (ret) + return ret; constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - -err: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); - if (err_cpu == cpu) - break; - } - - return ret; } -- cgit v1.2.3 From f95e5a3d59011eec1257d0e76de1e1f8969d426f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:14 +0200 Subject: powerpc/hw_breakpoint: Avoid relying on caller synchronization Internal data structures (cpu_bps, task_bps) of powerpc's hw_breakpoint implementation have relied on nr_bp_mutex serializing access to them. Before overhauling synchronization of kernel/events/hw_breakpoint.c, introduce 2 spinlocks to synchronize cpu_bps and task_bps respectively, thus avoiding reliance on callers synchronizing powerpc's hw_breakpoint. 
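The conversion below follows the usual pattern (sketch): each shared structure
gets its own spinlock taken around the traversal, rather than assuming the
caller still holds nr_bp_mutex:

        spin_lock(&task_bps_lock);
        list_for_each_entry(tmp, &task_bps, list) {
                if (!can_co_exist(tmp, bp)) {
                        ret = true;
                        break;
                }
        }
        spin_unlock(&task_bps_lock);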
Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-10-elver@google.com --- arch/powerpc/kernel/hw_breakpoint.c | 53 ++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 2669f80b3a49..8db1a15d7acb 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -129,7 +130,14 @@ struct breakpoint { bool ptrace_bp; }; +/* + * While kernel/events/hw_breakpoint.c does its own synchronization, we cannot + * rely on it safely synchronizing internals here; however, we can rely on it + * not requesting more breakpoints than available. + */ +static DEFINE_SPINLOCK(cpu_bps_lock); static DEFINE_PER_CPU(struct breakpoint *, cpu_bps[HBP_NUM_MAX]); +static DEFINE_SPINLOCK(task_bps_lock); static LIST_HEAD(task_bps); static struct breakpoint *alloc_breakpoint(struct perf_event *bp) @@ -174,7 +182,9 @@ static int task_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&task_bps_lock); list_add(&tmp->list, &task_bps); + spin_unlock(&task_bps_lock); return 0; } @@ -182,6 +192,7 @@ static void task_bps_remove(struct perf_event *bp) { struct list_head *pos, *q; + spin_lock(&task_bps_lock); list_for_each_safe(pos, q, &task_bps) { struct breakpoint *tmp = list_entry(pos, struct breakpoint, list); @@ -191,6 +202,7 @@ static void task_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&task_bps_lock); } /* @@ -200,12 +212,17 @@ static void task_bps_remove(struct perf_event *bp) static bool all_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { - if (!can_co_exist(tmp, bp)) - return true; + if (!can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } /* @@ -215,13 +232,18 @@ static bool all_task_bps_check(struct perf_event *bp) static bool same_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { if (tmp->bp->hw.target == bp->hw.target && - !can_co_exist(tmp, bp)) - return true; + !can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } static int cpu_bps_add(struct perf_event *bp) @@ -234,6 +256,7 @@ static int cpu_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) { @@ -241,6 +264,7 @@ static int cpu_bps_add(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); return 0; } @@ -249,6 +273,7 @@ static void cpu_bps_remove(struct perf_event *bp) struct breakpoint **cpu_bp; int i = 0; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) @@ -260,19 +285,25 @@ static void cpu_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); } static bool cpu_bps_check(int cpu, struct perf_event *bp) { struct breakpoint **cpu_bp; + bool ret = false; int i; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, cpu); for (i = 0; i < nr_wp_slots(); 
i++) { - if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) - return true; + if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&cpu_bps_lock); + return ret; } static bool all_cpu_bps_check(struct perf_event *bp) @@ -286,10 +317,6 @@ static bool all_cpu_bps_check(struct perf_event *bp) return false; } -/* - * We don't use any locks to serialize accesses to cpu_bps or task_bps - * because are already inside nr_bp_mutex. - */ int arch_reserve_bp_slot(struct perf_event *bp) { int ret; -- cgit v1.2.3 From 47a3aeb39e8dc099ae431cd8b46bdf218f5511b2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:55 -0700 Subject: perf/x86/intel/pebs: Fix PEBS timestamps overwritten The PEBS TSC-based timestamps do not appear correctly in the final perf.data output file from perf record. The data->time field setup by PEBS in the setup_pebs_fixed_sample_data() is later overwritten by perf_events generic code in perf_prepare_sample(). There is an ordering problem. Set the sample flags when the data->time is updated by PEBS. The data->time field will not be overwritten anymore. Reported-by: Andreas Kogler Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-3-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba60427caa6d..cdd857bd2dd6 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1635,8 +1635,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * We can only do this for the default trace clock. */ if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) + event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(pebs->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } if (has_branch_stack(event)) data->br_stack = &cpuc->lbr_stack; @@ -1697,8 +1699,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; - if (event->attr.use_clockid == 0) + if (event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(basic->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } /* * We must however always use iregs for the unwinder to stay sane; the -- cgit v1.2.3 From a9a931e2666878343782c82d7d55cc173ddeb3e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:56 -0700 Subject: perf: Use sample_flags for branch stack Use the new sample_flags to indicate whether the branch stack is filled by the PMU driver. Remove the br_stack from the perf_sample_data_init() to minimize the number of cache lines touched. 
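The driver-side idiom introduced here, and repeated for the other sample
fields in the following patches, is a two-step sketch: fill the field, then
advertise it through sample_flags so the generic code knows it is valid:

        if (has_branch_stack(event)) {
                data.br_stack = &cpuc->lbr_stack;
                data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
        }

The generic code then tests data->sample_flags & PERF_SAMPLE_BRANCH_STACK
instead of checking data->br_stack for NULL.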
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-4-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/amd/core.c | 4 +++- arch/x86/events/core.c | 4 +++- arch/x86/events/intel/core.c | 4 +++- arch/x86/events/intel/ds.c | 5 ++++- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 4 ++-- 7 files changed, 18 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 13919eb96931..1ad1efdb33f9 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2297,6 +2297,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, cpuhw = this_cpu_ptr(&cpu_hw_events); power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 36bede1d7b1e..bd99d2ae14c3 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -929,8 +929,10 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f969410d0c90..bb34a28fa71b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1714,8 +1714,10 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..ba101c28dcc9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2995,8 +2995,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cdd857bd2dd6..0489f750baa0 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1640,8 +1640,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_TIME; } - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1791,6 +1793,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0978165a2d87..1e12e79454e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1011,7 +1011,6 @@ struct perf_sample_data { u64 sample_flags; u64 
addr; struct perf_raw_record *raw; - struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; @@ -1021,6 +1020,8 @@ struct perf_sample_data { * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + u64 type; u64 ip; struct { @@ -1061,7 +1062,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->sample_flags = 0; data->addr = addr; data->raw = NULL; - data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; diff --git a/kernel/events/core.c b/kernel/events/core.c index c9b9cb79231a..104c0c9f4e6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7052,7 +7052,7 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr @@ -7358,7 +7358,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { if (perf_sample_save_hw_index(event)) size += sizeof(u64); -- cgit v1.2.3 From 2abe681da0a192ab850a5271d838a7817b469fca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:57 -0700 Subject: perf: Use sample_flags for weight Use the new sample_flags to indicate whether the weight field is filled by the PMU driver. Remove the weight field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-5-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 5 +++-- arch/x86/events/intel/ds.c | 10 +++++++--- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 14 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 1ad1efdb33f9..a5c95a2006ea 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2305,9 +2305,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && - ppmu->get_mem_weight) + ppmu->get_mem_weight) { ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); - + data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } else if (period) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 0489f750baa0..4c51118e4add 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1527,8 +1527,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { data->weight.full = pebs->lat; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } /* * data.data_src encodes the data source @@ -1620,9 +1622,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. 
*/ - if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); - + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); @@ -1764,6 +1767,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e12e79454e0..06a587b5faa9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1021,6 +1020,7 @@ struct perf_sample_data { * perf_{prepare,output}_sample(). */ struct perf_branch_stack *br_stack; + union perf_sample_weight weight; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 104c0c9f4e6f..f0af45db02b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7408,6 +7408,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From e16fd7f2cb1a65555cfe76f983eaefb1eab7471f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:58 -0700 Subject: perf: Use sample_flags for data_src Use the new sample_flags to indicate whether the data_src field is filled by the PMU driver. Remove the data_src field from the perf_sample_data_init() to minimize the number of cache lines touched. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-6-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a5c95a2006ea..6ec7069e6482 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2301,8 +2301,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && - ppmu->get_mem_data_src) + ppmu->get_mem_data_src) { ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); + data.sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && ppmu->get_mem_weight) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4c51118e4add..bde73d492889 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1535,8 +1535,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, pebs->dse); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1770,8 +1772,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, meminfo->aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06a587b5faa9..6849f10dfc7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1013,7 +1013,6 @@ struct perf_sample_data { struct perf_raw_record *raw; u64 period; u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { */ struct perf_branch_stack *br_stack; union perf_sample_weight weight; + union perf_mem_data_src data_src; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index f0af45db02b3..163e2f478e61 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7411,6 +7411,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) data->weight.full = 0; + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From ee9db0e14b0575aa827579dc2471a29ec5fc6877 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:59 -0700 Subject: perf: Use sample_flags for txn Use the new sample_flags to indicate whether the txn field is filled by the PMU driver. Remove the txn field from the perf_sample_data_init() to minimize the number of cache lines touched. 
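Taken together with the weight and data_src patches, the generic side now
initializes these fields lazily, only when the sample type requests them and
the PMU driver has not already provided them (sketch of the
perf_prepare_sample() checks, where filtered_sample_type is the requested
sample_type with the bits already set in data->sample_flags masked out):

        if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
                data->weight.full = 0;
        if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
                data->data_src.val = PERF_MEM_NA;
        if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
                data->txn = 0;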
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-7-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bde73d492889..a5275c235c2a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1628,9 +1628,11 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } /* @@ -1780,9 +1782,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, gprs ? gprs->ax : 0); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } if (format_size & PEBS_DATACFG_XMMS) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6849f10dfc7e..581880ddb9ef 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - u64 txn; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; union perf_sample_weight weight; union perf_mem_data_src data_src; + u64 txn; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->txn = 0; } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 163e2f478e61..15d27b14c827 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = PERF_MEM_NA; + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From 88081cfb699ce2568e5309c145eb9f9e9497b53f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:24 +0530 Subject: x86/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. 
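The mechanism is the usual X-macro pattern (condensed sketch of the hunks
below): the flag list lives in a single header that is expanded twice under
different PERF_ARCH() definitions, once to build the enum and once to emit a
static_assert per flag:

        #define PERF_ARCH(name, val)    PERF_X86_EVENT_##name = val,
        enum {
        #include "perf_event_flags.h"
        };
        #undef PERF_ARCH

        #define PERF_ARCH(name, val) \
                static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == PERF_X86_EVENT_##name);
        #include "perf_event_flags.h"
        #undef PERF_ARCH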
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-5-anshuman.khandual@arm.com --- arch/x86/events/perf_event.h | 34 ++++++++++++++++------------------ arch/x86/events/perf_event_flags.h | 22 ++++++++++++++++++++++ 2 files changed, 38 insertions(+), 18 deletions(-) create mode 100644 arch/x86/events/perf_event_flags.h (limited to 'arch') diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 93263b98cd6e..4a3dde24a1ae 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -64,27 +64,25 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) return ((ecode & c->cmask) - c->code) <= (u64)c->size; } +#define PERF_ARCH(name, val) \ + PERF_X86_EVENT_##name = val, + /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ - -#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ -#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ -#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */ +enum { +#include "perf_event_flags.h" +}; + +#undef PERF_ARCH + +#define PERF_ARCH(name, val) \ + static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \ + PERF_X86_EVENT_##name); + +#include "perf_event_flags.h" + +#undef PERF_ARCH static inline bool is_topdown_count(struct perf_event *event) { diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h new file mode 100644 index 000000000000..1dc19b9b4426 --- /dev/null +++ b/arch/x86/events/perf_event_flags.h @@ -0,0 +1,22 @@ + +/* + * struct hw_perf_event.flags flags + */ +PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ + /* 0x00080 */ +PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 
0x04000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -- cgit v1.2.3 From 73759c346341d39dfde39701476c0376dea0a98b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:27:22 +0200 Subject: perf/x86: Add two more x86_pmu methods In order to clean up x86_perf_event_{set_period,update)() start by adding them as x86_pmu methods. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.440196408@infradead.org --- arch/x86/events/core.c | 22 +++++++++++++++++----- arch/x86/events/perf_event.h | 5 +++++ 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bb34a28fa71b..bb559b79ff23 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -72,6 +72,9 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); + DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); @@ -1518,7 +1521,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); } event->hw.state = 0; @@ -1610,7 +1613,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -1700,7 +1703,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; - val = x86_perf_event_update(event); + val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; @@ -1709,7 +1712,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) */ handled++; - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); @@ -2025,6 +2028,9 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); + static_call_update(x86_pmu_set_period, x86_pmu.set_period); + static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); @@ -2044,7 +2050,7 @@ static void x86_pmu_static_call_update(void) static void _x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, @@ -2151,6 +2157,12 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; + if (!x86_pmu.set_period) + x86_pmu.set_period = x86_perf_event_set_period; + + if (!x86_pmu.update) + 
x86_pmu.update = x86_perf_event_update; + x86_pmu_static_call_update(); /* diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4a3dde24a1ae..7ae1a6c5368c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -743,6 +743,8 @@ struct x86_pmu { void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); + int (*set_period)(struct perf_event *event); + u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -1042,6 +1044,9 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; +DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); +DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); + static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) -- cgit v1.2.3 From e577bb17a1eaa35b86ee873a786e603be768d668 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:06 +0200 Subject: perf/x86/intel: Move the topdown stuff into the intel driver Use the new x86_pmu::{set_period,update}() methods to push the topdown stuff into the Intel driver, where it belongs. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.505933457@infradead.org --- arch/x86/events/core.c | 7 ------- arch/x86/events/intel/core.c | 26 +++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bb559b79ff23..b074e71bab21 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -119,9 +119,6 @@ u64 x86_perf_event_update(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) - return x86_pmu.update_topdown_event(event); - /* * Careful: an NMI might modify the previous event value. * @@ -1373,10 +1370,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && - x86_pmu.set_topdown_event_period) - return x86_pmu.set_topdown_event_period(event); - /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ba101c28dcc9..feed732fdf57 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2302,7 +2302,7 @@ static void intel_pmu_nhm_workaround(void) for (i = 0; i < 4; i++) { event = cpuc->events[i]; if (event) - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } for (i = 0; i < 4; i++) { @@ -2317,7 +2317,7 @@ static void intel_pmu_nhm_workaround(void) event = cpuc->events[i]; if (event) { - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } else @@ -2794,7 +2794,7 @@ static void intel_pmu_add_event(struct perf_event *event) */ int intel_pmu_save_and_restart(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); /* * For a checkpointed counter always reset back to 0. 
This * avoids a situation where the counter overflows, aborts the @@ -2806,9 +2806,27 @@ int intel_pmu_save_and_restart(struct perf_event *event) wrmsrl(event->hw.event_base, 0); local64_set(&event->hw.prev_count, 0); } + return static_call(x86_pmu_set_period)(event); +} + +static int intel_pmu_set_period(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event)) && + x86_pmu.set_topdown_event_period) + return x86_pmu.set_topdown_event_period(event); + return x86_perf_event_set_period(event); } +static u64 intel_pmu_update(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event)) && + x86_pmu.update_topdown_event) + return x86_pmu.update_topdown_event(event); + + return x86_perf_event_update(event); +} + static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); @@ -4786,6 +4804,8 @@ static __initconst const struct x86_pmu intel_pmu = { .add = intel_pmu_add_event, .del = intel_pmu_del_event, .read = intel_pmu_read_event, + .set_period = intel_pmu_set_period, + .update = intel_pmu_update, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, -- cgit v1.2.3 From 28f0f3c44b5c35be657a4f922dcdfb48285f4373 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:25 +0200 Subject: perf/x86: Change x86_pmu::limit_period signature In preparation for making it a static_call, change the signature. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.573713839@infradead.org --- arch/x86/events/amd/core.c | 8 +++----- arch/x86/events/core.c | 13 ++++++++----- arch/x86/events/intel/core.c | 19 ++++++++----------- arch/x86/events/perf_event.h | 2 +- 4 files changed, 20 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index bd99d2ae14c3..8b70237c33f7 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1224,16 +1224,14 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) +static void amd_pmu_limit_period(struct perf_event *event, s64 *left) { /* * Decrease period by the depth of the BRS feature to get the last N * taken branches and approximate the desired period */ - if (has_branch_stack(event) && left > x86_pmu.lbr_nr) - left -= x86_pmu.lbr_nr; - - return left; + if (has_branch_stack(event) && *left > x86_pmu.lbr_nr) + *left -= x86_pmu.lbr_nr; } static __initconst const struct x86_pmu amd_pmu = { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b074e71bab21..1e90bc7ca36f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -621,8 +621,9 @@ int x86_pmu_hw_config(struct perf_event *event) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) + s64 left = event->attr.sample_period; + x86_pmu.limit_period(event, &left); + if (left > event->attr.sample_period) return -EINVAL; } @@ -1396,9 +1397,9 @@ int x86_perf_event_set_period(struct perf_event *event) left = x86_pmu.max_period; if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); + x86_pmu.limit_period(event, &left); - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event 
offset, @@ -2677,7 +2678,9 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value) return -EINVAL; if (value && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, value) > value) + s64 left = value; + x86_pmu.limit_period(event, &left); + if (left > value) return -EINVAL; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index feed732fdf57..92cc390590d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4344,28 +4344,25 @@ static u8 adl_get_hybrid_cpu_type(void) * Therefore the effective (average) period matches the requested period, * despite coarser hardware granularity. */ -static u64 bdw_limit_period(struct perf_event *event, u64 left) +static void bdw_limit_period(struct perf_event *event, s64 *left) { if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fULL; + if (*left < 128) + *left = 128; + *left &= ~0x3fULL; } - return left; } -static u64 nhm_limit_period(struct perf_event *event, u64 left) +static void nhm_limit_period(struct perf_event *event, s64 *left) { - return max(left, 32ULL); + *left = max(*left, 32LL); } -static u64 spr_limit_period(struct perf_event *event, u64 left) +static void spr_limit_period(struct perf_event *event, s64 *left) { if (event->attr.precise_ip == 3) - return max(left, 128ULL); - - return left; + *left = max(*left, 128LL); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7ae1a6c5368c..e82d2d212534 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -781,7 +781,7 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; - u64 (*limit_period)(struct perf_event *event, u64 l); + void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ unsigned int late_ack :1, -- cgit v1.2.3 From 08b3068fab207e3c7d79799d434e1d648524cac6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:18 +0200 Subject: perf/x86: Add a x86_pmu::limit_period static_call Avoid a branch and indirect call. 
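In condensed form (the lines below are lifted from the hunks that follow, gathered here only as a reading aid, not additional code), the optional hook becomes a NULL-defaulted static call:

	/* NULL-defaulted static call for the optional hook */
	DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);

	/* wired up once the active struct x86_pmu is known */
	static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);

	/* call site: a direct call when a hook is installed, a no-op when it is NULL */
	static_call_cond(x86_pmu_limit_period)(event, &left);

With static_call_cond() the previous "if (x86_pmu.limit_period)" test and the indirect call both disappear from x86_perf_event_set_period().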
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.640658334@infradead.org --- arch/x86/events/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1e90bc7ca36f..05830bb77d40 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -72,8 +72,9 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); -DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); -DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); @@ -1396,8 +1397,7 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; - if (x86_pmu.limit_period) - x86_pmu.limit_period(event, &left); + static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); @@ -2024,6 +2024,7 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); -- cgit v1.2.3 From 2368516731901c391826f7cf23516173193652fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 May 2022 16:41:25 +0200 Subject: perf/x86/intel: Remove x86_pmu::set_topdown_event_period Now that it is all internal to the intel driver, remove x86_pmu::set_topdown_event_period. 
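The shape of the change, condensed from the hunks that follow (a reading aid only; all identifiers come from this patch): the per-model function pointer becomes a static call whose default target is the generic period helper, and each affected model retargets it in intel_pmu_init().

	DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period);

	static int intel_pmu_set_period(struct perf_event *event)
	{
		if (unlikely(is_topdown_count(event)))
			return static_call(intel_pmu_set_topdown_event_period)(event);

		return x86_perf_event_set_period(event);
	}

	/* e.g. in the Icelake branch of intel_pmu_init() */
	static_call_update(intel_pmu_set_topdown_event_period,
			   &icl_set_topdown_event_period);

Non-topdown events never reach the static call, and the default target preserves the old behaviour on models that never retarget it.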
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.706354189@infradead.org --- arch/x86/events/intel/core.c | 16 ++++++++++------ arch/x86/events/perf_event.h | 1 - 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 92cc390590d1..75400ed0eac3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2521,6 +2521,8 @@ static int adl_set_topdown_event_period(struct perf_event *event) return icl_set_topdown_event_period(event); } +DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); + static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) { u32 val; @@ -2811,9 +2813,8 @@ int intel_pmu_save_and_restart(struct perf_event *event) static int intel_pmu_set_period(struct perf_event *event) { - if (unlikely(is_topdown_count(event)) && - x86_pmu.set_topdown_event_period) - return x86_pmu.set_topdown_event_period(event); + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_set_topdown_event_period)(event); return x86_perf_event_set_period(event); } @@ -6292,7 +6293,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Icelake events, "); name = "icelake"; break; @@ -6330,7 +6332,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 8; x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Sapphire Rapids events, "); name = "sapphire_rapids"; break; @@ -6367,7 +6370,8 @@ __init int intel_pmu_init(void) x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; x86_pmu.update_topdown_event = adl_update_topdown_event; - x86_pmu.set_topdown_event_period = adl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &adl_set_topdown_event_period); x86_pmu.filter_match = intel_pmu_filter_match; x86_pmu.get_event_constraints = adl_get_event_constraints; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e82d2d212534..be27eadc40a3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -890,7 +890,6 @@ struct x86_pmu { */ int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); - int (*set_topdown_event_period)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) -- cgit v1.2.3 From 1acab2e01c9c5df00f2fddf3473014dea89dcb5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 May 2022 17:02:05 +0200 Subject: perf/x86/intel: Remove x86_pmu::update_topdown_event Now that it is all internal to the intel driver, remove x86_pmu::update_topdown_event. Assumes that is_topdown_count(event) can only be true when the hardware has topdown stuff and the function is set. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.771635301@infradead.org --- arch/x86/events/intel/core.c | 22 ++++++++++++---------- arch/x86/events/perf_event.h | 1 - 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 75400ed0eac3..1e429e86a7f4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2673,6 +2673,7 @@ static u64 adl_update_topdown_event(struct perf_event *event) return icl_update_topdown_event(event); } +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); static void intel_pmu_read_topdown_event(struct perf_event *event) { @@ -2684,7 +2685,7 @@ static void intel_pmu_read_topdown_event(struct perf_event *event) return; perf_pmu_disable(event->pmu); - x86_pmu.update_topdown_event(event); + static_call(intel_pmu_update_topdown_event)(event); perf_pmu_enable(event->pmu); } @@ -2692,7 +2693,7 @@ static void intel_pmu_read_event(struct perf_event *event) { if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_auto_reload_read(event); - else if (is_topdown_count(event) && x86_pmu.update_topdown_event) + else if (is_topdown_count(event)) intel_pmu_read_topdown_event(event); else x86_perf_event_update(event); @@ -2821,9 +2822,8 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { - if (unlikely(is_topdown_count(event)) && - x86_pmu.update_topdown_event) - return x86_pmu.update_topdown_event(event); + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_update_topdown_event)(event); return x86_perf_event_update(event); } @@ -2990,8 +2990,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - if (x86_pmu.update_topdown_event) - x86_pmu.update_topdown_event(NULL); + static_call(intel_pmu_update_topdown_event)(NULL); } /* @@ -6292,7 +6291,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 4; - x86_pmu.update_topdown_event = icl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &icl_set_topdown_event_period); pr_cont("Icelake events, "); @@ -6331,7 +6331,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = icl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &icl_set_topdown_event_period); pr_cont("Sapphire Rapids events, "); @@ -6369,7 +6370,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_adl(); x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = adl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &adl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &adl_set_topdown_event_period); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index be27eadc40a3..386ebfa416da 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -889,7 +889,6 @@ struct x86_pmu { * Intel perf metrics */ int num_topdown_events; - u64 
(*update_topdown_event)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) -- cgit v1.2.3 From dbf4e792beadafc684ef455453c613ff182c7723 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 May 2022 15:38:43 +0200 Subject: perf/x86/p4: Remove perfctr_second_write quirk Now that we have a x86_pmu::set_period() method, use it to remove the perfctr_second_write quirk from the generic code. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.839502514@infradead.org --- arch/x86/events/core.c | 12 +----------- arch/x86/events/intel/p4.c | 37 +++++++++++++++++++++++++++---------- arch/x86/events/perf_event.h | 2 +- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 05830bb77d40..b30b8bbcd1e2 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1356,7 +1356,7 @@ static void x86_pmu_enable(struct pmu *pmu) static_call(x86_pmu_enable_all)(added); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); +DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -1416,16 +1416,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } - perf_event_update_userpage(event); return ret; diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 7951a5dc73b6..03bbcc2fa2ff 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -1006,6 +1006,29 @@ static void p4_pmu_enable_all(int added) } } +static int p4_pmu_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = this_cpu_read(pmc_prev_left[hwc->idx]); + int ret; + + ret = x86_perf_event_set_period(event); + + if (hwc->event_base) { + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } + + return ret; +} + static int p4_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; @@ -1044,7 +1067,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) /* event overflow for sure */ perf_sample_data_init(&data, 0, hwc->last_period); - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; @@ -1316,6 +1339,9 @@ static __initconst const struct x86_pmu p4_pmu = { .enable_all = p4_pmu_enable_all, .enable = p4_pmu_enable_event, .disable = p4_pmu_disable_event, + + .set_period = p4_pmu_set_period, + .eventsel = MSR_P4_BPU_CCCR0, .perfctr = MSR_P4_BPU_PERFCTR0, .event_map = p4_pmu_event_map, @@ -1334,15 +1360,6 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former 
idea is taken from OProfile code - */ - .perfctr_second_write = 1, .format_attrs = intel_p4_formats_attr, }; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 386ebfa416da..20c2ee2128fe 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -780,7 +780,6 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; - int perfctr_second_write; void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ @@ -1060,6 +1059,7 @@ static inline bool x86_pmu_has_lbr_callstack(void) } DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); +DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); int x86_perf_event_set_period(struct perf_event *event); -- cgit v1.2.3 From fae9ebde9696385fa2e993e752cf68d9781f3ea0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 4 Aug 2022 07:07:29 -0700 Subject: perf/x86/intel: Optimize FIXED_CTR_CTRL access All the fixed counters share a fixed control register. The current perf reads and re-writes the fixed control register for each fixed counter disable/enable, which is unnecessary. When changing the fixed control register, the entire PMU must be disabled via the global control register. The changing cannot be taken effect until the entire PMU is re-enabled. Only updating the fixed control register once right before the entire PMU re-enabling is enough. The read of the fixed control register is not necessary either. The value can be cached in the per CPU cpu_hw_events. Test results: Counting all the fixed counters with the perf bench sched pipe as below on a SPR machine. $perf stat -e cycles,instructions,ref-cycles,slots --no-inherit -- taskset -c 1 perf bench sched pipe The Total elapsed time reduces from 5.36s (without the patch) to 4.99s (with the patch), which is ~6.9% improvement. 
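The mechanics, condensed from the hunks that follow (a reading aid only): MSR_ARCH_PERFMON_FIXED_CTR_CTRL gets a per-CPU shadow in cpu_hw_events, the per-event enable/disable paths only edit that shadow, and the hardware register is written at most once, while the PMU is still globally disabled.

	/* intel_pmu_{enable,disable}_fixed(): no rdmsrl()/wrmsrl() per event any more */
	cpuc->fixed_ctrl_val &= ~mask;
	cpuc->fixed_ctrl_val |= bits;

	/* __intel_pmu_enable_all(): flush the shadow only if it actually changed */
	if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) {
		wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val);
		cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val;
	}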
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220804140729.2951259-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 22 +++++++++++++--------- arch/x86/events/perf_event.h | 4 ++++ 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1e429e86a7f4..7f4e7e6b45f0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2190,6 +2190,12 @@ static void __intel_pmu_enable_all(int added, bool pmi) u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); intel_pmu_lbr_enable_all(pmi); + + if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) { + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); + cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val; + } + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, intel_ctrl & ~cpuc->intel_ctrl_guest_mask); @@ -2407,9 +2413,10 @@ static inline void intel_clear_masks(struct perf_event *event, int idx) static void intel_pmu_disable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask; int idx = hwc->idx; + u64 mask; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2426,9 +2433,7 @@ static void intel_pmu_disable_fixed(struct perf_event *event) intel_clear_masks(event, idx); mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; } static void intel_pmu_disable_event(struct perf_event *event) @@ -2701,8 +2706,9 @@ static void intel_pmu_read_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask, bits = 0; + u64 mask, bits = 0; int idx = hwc->idx; if (is_topdown_idx(idx)) { @@ -2746,10 +2752,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); } - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val |= bits; } static void intel_pmu_enable_event(struct perf_event *event) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 20c2ee2128fe..371967054871 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -270,6 +270,10 @@ struct cpu_hw_events { u64 active_pebs_data_cfg; int pebs_record_size; + /* Intel Fixed counter configuration */ + u64 fixed_ctrl_val; + u64 active_fixed_ctrl_val; + /* * Intel LBR bits */ -- cgit v1.2.3 From 3749d33e510c3dc695b3a5886b706310890d7ebd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:02 -0700 Subject: perf: Use sample_flags for callchain So that it can call perf_callchain() only if needed. Historically it used __PERF_SAMPLE_CALLCHAIN_EARLY but we can do that with sample_flags in the struct perf_sample_data. 
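The resulting contract, condensed from the hunks that follow (a reading aid only): a PMU driver that resolves the callchain early (IBS, PEBS) stores it and sets the corresponding bit in data->sample_flags, and perf_prepare_sample() only calls perf_callchain() for the bits the driver left unset.

	/* driver side (amd/ibs.c, intel/ds.c) */
	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
		data->callchain = perf_callchain(event, iregs);
		data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
	}

	/* generic side, inside perf_prepare_sample() */
	if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
		data->callchain = perf_callchain(event, regs);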
Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-1-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- kernel/events/core.c | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c251bc44c088..dab094166693 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -798,8 +798,10 @@ fail: * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { data.callchain = perf_callchain(event, iregs); + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; + } throttle = perf_event_overflow(event, &data, ®s); out: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a5275c235c2a..4ba6ab6d0d92 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1546,8 +1546,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } /* * We use the interrupt regs as a base because the PEBS record does not @@ -1719,8 +1721,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } *regs = *iregs; /* The ip in basic is EventingIP */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e90e454b995..c98ecf3e09ba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7320,7 +7320,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; -- cgit v1.2.3 From b4e12b2d70fd9eccdb3cef8015dc1788ca38e3fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:04 -0700 Subject: perf: Kill __PERF_SAMPLE_CALLCHAIN_EARLY There's no in-tree user anymore. Let's get rid of it. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-3-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 10 ---------- arch/x86/events/intel/core.c | 3 --- include/uapi/linux/perf_event.h | 2 -- 3 files changed, 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index dab094166693..ce5720bfb350 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -300,16 +300,6 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; - /* - * rip recorded by IbsOpRip will not be consistent with rsp and rbp - * recorded as part of interrupt regs. Thus we need to use rip from - * interrupt regs while unwinding call stack. Setting _EARLY flag - * makes sure we unwind call-stack before perf sample rip is set to - * IbsOpRip. 
- */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; - return 0; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7f4e7e6b45f0..b16c91ac9219 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3868,9 +3868,6 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index dca16582885f..e639c74cf5fb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -164,8 +164,6 @@ enum perf_event_sample_format { PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ - - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) -- cgit v1.2.3 From 7b084630153152239d84990ac4540c2dd360186f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:31 -0700 Subject: perf: Use sample_flags for addr Use the new sample_flags to indicate whether the addr field is filled by the PMU driver. As most PMU drivers pass 0, it can set the flag only if it has a non-zero value. And use 0 in perf_sample_output() if it's not filled already. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-1-namhyung@kernel.org --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 8 ++++++-- kernel/events/core.c | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4ba6ab6d0d92..d2e9ff16f6ed 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1621,8 +1621,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && - x86_pmu.intel_cap.pebs_format >= 1) + x86_pmu.intel_cap.pebs_format >= 1) { data->addr = pebs->dla; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ @@ -1783,8 +1785,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_DATA_SRC; } - if (sample_type & PERF_SAMPLE_ADDR_TYPE) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { data->addr = meminfo->address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 368bdc4f563f..f4a13579b0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. 
*/ u64 sample_flags; - u64 addr; struct perf_raw_record *raw; u64 period; @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; + u64 addr; u64 type; u64 ip; @@ -1079,9 +1079,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = 0; - data->addr = addr; data->raw = NULL; data->period = period; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index c07e9a3ea94c..a91f74db9fe9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,11 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) data->txn = 0; + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3 From 838d9bb62d132ec3baf1b5aba2e95ef9a7a9a3cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:32 -0700 Subject: perf: Use sample_flags for raw_data Use the new sample_flags to indicate whether the raw data field is filled by the PMU driver. Although it could check with the NULL, follow the same rule with other fields. Remove the raw field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-2-namhyung@kernel.org --- arch/s390/kernel/perf_cpum_cf.c | 1 + arch/s390/kernel/perf_pai_crypto.c | 1 + arch/x86/events/amd/ibs.c | 1 + include/linux/perf_event.h | 5 ++--- kernel/events/core.c | 3 ++- 5 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f7dd3c849e68..f043a7ff220b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -664,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event, raw.frag.data = cpuhw->stop; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index b38b4ae01589..6826e2a69a21 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -366,6 +366,7 @@ static int paicrypt_push_sample(void) raw.frag.data = cpump->save; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ce5720bfb350..c29a006954c7 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -781,6 +781,7 @@ fail: }, }; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f4a13579b0e8..e9b151cde491 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. 
*/ u64 sample_flags; - struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_mem_data_src data_src; u64 txn; u64 addr; + struct perf_raw_record *raw; u64 type; u64 ip; @@ -1078,8 +1078,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->sample_flags = 0; - data->raw = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; if (addr) { diff --git a/kernel/events/core.c b/kernel/events/core.c index a91f74db9fe9..04e19a857d4b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7332,7 +7332,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7348,6 +7348,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; -- cgit v1.2.3 From 50b0c97bf00e4815aee09cace28b940ebb060e69 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:28 -0700 Subject: perf/x86: Add new Raptor Lake S support From PMU's perspective, the new Raptor Lake S is the same as the other of hybrid {ALDER,RAPTOP}LAKE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b2d8def4d03c..3939debb27e7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6344,6 +6344,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: /* * Alder Lake has 2 types of CPU, core and atom. * -- cgit v1.2.3 From 193c888b7ffe4da97346950c0e98dd77cc629f24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:29 -0700 Subject: perf/x86/msr: Add new Raptor Lake S support The same as the other hybrid {ALDER,RAPTOP}LAKE, the new Raptor Lake S also support PPERF and SMI_COUNT MSRs. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-2-kan.liang@linux.intel.com --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ac542f98c070..ecced3a52668 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -106,6 +106,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; -- cgit v1.2.3 From d12940d2ead51c6978e7d38b2abf12b833270b2a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:30 -0700 Subject: perf/x86/cstate: Add new Raptor Lake S support From the perspective of Intel cstate residency counters, the new Raptor Lake S is the same as the other hybrid {ALDER,RAPTOP}LAKE. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-3-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 8ec23f47fee9..a2834bc93149 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -685,6 +685,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -- cgit v1.2.3 From e04a1607c9c3c0e2dc48715aeb570f2581f514bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:31 -0700 Subject: perf/x86/uncore: Add new Raptor Lake S support From the perspective of the uncore PMU, the new Raptor Lake S is the same as the other hybrid {ALDER,RAPTOP}LAKE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-4-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index db6c31bca809..6f1ccc57a692 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1831,6 +1831,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, -- cgit v1.2.3 From 610c238041fbc682936d34132362a54a802600fe Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:52 +0530 Subject: perf/x86/amd: Add IBS OP_DATA2 DataSrc bit definitions IBS_OP_DATA2 DataSrc provides detail about location of the data being accessed from by load ops. Define macros for legacy and extended DataSrc values. 
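These encodings are consumed by the PERF_SAMPLE_DATA_SRC support added later in this series; for reference (condensed from that follow-up patch, nothing new added here), the extended IBS_DATA_SRC_EXT_* values only become reachable on IBS_CAPS_ZEN4 parts, where the two DataSrc extension bits are placed above the legacy 3-bit field:

	static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
	{
		/* Zen4: combine the DataSrc extension with the legacy DataSrc bits */
		if (ibs_caps & IBS_CAPS_ZEN4)
			return (op_data2->data_src_hi << 3) | op_data2->data_src_lo;

		/* older parts: legacy IBS_DATA_SRC_* values only */
		return op_data2->data_src_lo;
	}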
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-3-ravi.bangoria@amd.com --- arch/x86/include/asm/amd-ibs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index f3eb098d63d4..cb2a5e113daa 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -6,6 +6,22 @@ #include +/* IBS_OP_DATA2 DataSrc */ +#define IBS_DATA_SRC_LOC_CACHE 2 +#define IBS_DATA_SRC_DRAM 3 +#define IBS_DATA_SRC_REM_CACHE 4 +#define IBS_DATA_SRC_IO 7 + +/* IBS_OP_DATA2 DataSrc Extension */ +#define IBS_DATA_SRC_EXT_LOC_CACHE 1 +#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2 +#define IBS_DATA_SRC_EXT_DRAM 3 +#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5 +#define IBS_DATA_SRC_EXT_PMEM 6 +#define IBS_DATA_SRC_EXT_IO 7 +#define IBS_DATA_SRC_EXT_EXT_MEM 8 +#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12 + /* * IBS Hardware MSRs */ -- cgit v1.2.3 From 7c10dd0a88b1cc6ae4637fffb494c5e080027eb6 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:53 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_DATA_SRC struct perf_mem_data_src is used to pass arch specific memory access details into generic form. These details gets consumed by tools like perf mem and c2c. IBS tagged load/store sample provides most of the information needed for these tools. Add a logic to convert IBS specific raw data into perf_mem_data_src. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-4-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 318 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 312 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c29a006954c7..e20caa5cf02f 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -678,6 +678,312 @@ static struct perf_ibs perf_ibs_op = { .get_count = get_ibs_op_count, }; +static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_op = PERF_MEM_OP_NA; + + if (op_data3->ld_op) + data_src->mem_op = PERF_MEM_OP_LOAD; + else if (op_data3->st_op) + data_src->mem_op = PERF_MEM_OP_STORE; +} + +/* + * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has + * more fine granular DataSrc encodings. Others have coarse. + */ +static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) +{ + if (ibs_caps & IBS_CAPS_ZEN4) + return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; + + return op_data2->data_src_lo; +} + +static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src = perf_ibs_data_src(op_data2); + + data_src->mem_lvl = 0; + + /* + * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached + * memory accesses. So, check DcUcMemAcc bit early. 
+ */ + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; + return; + } + + /* L1 Hit */ + if (op_data3->dc_miss == 0) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + return; + } + + /* L2 Hit */ + if (op_data3->l2_miss == 0) { + /* Erratum #1293 */ + if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* + * OP_DATA2 is valid only for load ops. Skip all checks which + * uses OP_DATA2[DataSrc]. + */ + if (data_src->mem_op != PERF_MEM_OP_LOAD) + goto check_mab; + + /* L3 Hit */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | + PERF_MEM_LVL_HIT; + return; + } + } + + /* A peer cache in a near CCX */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + return; + } + + /* A peer cache in a far CCX */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* DRAM */ + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { + if (op_data2->rmt_node == 0) + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; + else + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; + return; + } + + /* PMEM */ + if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* Extension Memory */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* IO */ + if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_IO; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + +check_mab: + /* + * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding + * DC misses. However, such data may come from any level in mem + * hierarchy. IBS provides detail about both MAB as well as actual + * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set + * MAB only when IBS fails to provide DataSrc. 
+ */ + if (op_data3->dc_miss_no_mab_alloc) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + return; + } + + data_src->mem_lvl = PERF_MEM_LVL_NA; +} + +static bool perf_ibs_cache_hit_st_valid(void) +{ + /* 0: Uninitialized, 1: Valid, -1: Invalid */ + static int cache_hit_st_valid; + + if (unlikely(!cache_hit_st_valid)) { + if (boot_cpu_data.x86 == 0x19 && + (boot_cpu_data.x86_model <= 0xF || + (boot_cpu_data.x86_model >= 0x20 && + boot_cpu_data.x86_model <= 0x5F))) { + cache_hit_st_valid = -1; + } else { + cache_hit_st_valid = 1; + } + } + + return cache_hit_st_valid == 1; +} + +static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src; + + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + + if (!perf_ibs_cache_hit_st_valid() || + data_src->mem_op != PERF_MEM_OP_LOAD || + data_src->mem_lvl & PERF_MEM_LVL_L1 || + data_src->mem_lvl & PERF_MEM_LVL_L2 || + op_data2->cache_hit_st) + return; + + ibs_data_src = perf_ibs_data_src(op_data2); + + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } +} + +static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_dtlb = PERF_MEM_TLB_NA; + + if (!op_data3->dc_lin_addr_valid) + return; + + if (!op_data3->dc_l1tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; + return; + } + + if (!op_data3->dc_l2tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; + return; + } + + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; +} + +static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_lock = PERF_MEM_LOCK_NA; + + if (op_data3->dc_locked_op) + data_src->mem_lock = PERF_MEM_LOCK_LOCKED; +} + +#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) + +static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, + struct perf_sample_data *data, + union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3) +{ + perf_ibs_get_mem_lvl(op_data2, op_data3, data); + perf_ibs_get_mem_snoop(op_data2, data); + perf_ibs_get_tlb_lvl(op_data3, data); + perf_ibs_get_mem_lock(op_data3, data); +} + +static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, + union ibs_op_data3 *op_data3) +{ + __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; + + /* Erratum #1293 */ + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && + (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + /* + * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. + * DataSrc=0 is 'No valid status' and RmtNode is invalid when + * DataSrc=0. 
+ */ + val = 0; + } + return val; +} + +static void perf_ibs_parse_ld_st_data(__u64 sample_type, + struct perf_ibs_data *ibs_data, + struct perf_sample_data *data) +{ + union ibs_op_data3 op_data3; + union ibs_op_data2 op_data2; + + data->data_src.val = PERF_MEM_NA; + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + + perf_ibs_get_mem_op(&op_data3, data); + if (data->data_src.mem_op != PERF_MEM_OP_LOAD && + data->data_src.mem_op != PERF_MEM_OP_STORE) + return; + + op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, + int check_rip) +{ + if (sample_type & PERF_SAMPLE_RAW || + (perf_ibs == &perf_ibs_op && + sample_type & PERF_SAMPLE_DATA_SRC)) + return perf_ibs->offset_max; + else if (check_rip) + return 3; + return 1; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -725,12 +1031,9 @@ fail: size = 1; offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 3; - else - offset_max = 1; + + offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + do { rdmsrl(msr + offset, *buf++); size++; @@ -784,6 +1087,9 @@ fail: data.sample_flags |= PERF_SAMPLE_RAW; } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from -- cgit v1.2.3 From 6b2ae4952ef8ac23b467bc10776404092b581143 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:54 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_{WEIGHT|WEIGHT_STRUCT} IbsDcMissLat indicates the number of clock cycles from when a miss is detected in the data cache to when the data was delivered to the core. Similarly, IbsTagToRetCtr provides number of cycles from when the op was tagged to when the op was retired. Consider these fields for sample->weight. 
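Condensed from the hunk that follows (a reading aid only): the weight is filled only for loads that missed the data cache; the struct form carries both latencies, the legacy form only the DC miss latency.

	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss &&
	    data->data_src.mem_op == PERF_MEM_OP_LOAD) {
		op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];

		if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
			data->weight.var1_dw = op_data3.dc_miss_lat;   /* IbsDcMissLat */
			data->weight.var2_w  = op_data.tag_to_ret_ctr; /* IbsTagToRetCtr */
		} else if (sample_type & PERF_SAMPLE_WEIGHT) {
			data->weight.full = op_data3.dc_miss_lat;
		}
		data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
	}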
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-5-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e20caa5cf02f..d883694e0fd4 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -955,6 +955,7 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, { union ibs_op_data3 op_data3; union ibs_op_data2 op_data2; + union ibs_op_data op_data; data->data_src.val = PERF_MEM_NA; op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; @@ -970,6 +971,19 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); data->sample_flags |= PERF_SAMPLE_DATA_SRC; } + + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && + data->data_src.mem_op == PERF_MEM_OP_LOAD) { + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { + data->weight.var1_dw = op_data3.dc_miss_lat; + data->weight.var2_w = op_data.tag_to_ret_ctr; + } else if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = op_data3.dc_miss_lat; + } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -977,7 +991,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, { if (sample_type & PERF_SAMPLE_RAW || (perf_ibs == &perf_ibs_op && - sample_type & PERF_SAMPLE_DATA_SRC)) + (sample_type & PERF_SAMPLE_DATA_SRC || + sample_type & PERF_SAMPLE_WEIGHT_TYPE))) return perf_ibs->offset_max; else if (check_rip) return 3; -- cgit v1.2.3 From cb2bb85f7ed8740ab5fc06bbec386faa39ba44ef Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:55 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_ADDR IBS_DC_LINADDR provides the linear data address for the tagged load/ store operation. Populate perf sample address using it. 
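Condensed from the hunk that follows (a reading aid only): the address is reported only when IBS marks the linear address as valid; the later PERF_SAMPLE_PHYS_ADDR patch in this series mirrors the same shape with dc_phy_addr_valid and MSR_AMD64_IBSDCPHYSAD.

	if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) {
		data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
		data->sample_flags |= PERF_SAMPLE_ADDR;
	}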
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-6-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index d883694e0fd4..0ad49105c154 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -984,6 +984,11 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } + + if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { + data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -992,7 +997,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, if (sample_type & PERF_SAMPLE_RAW || (perf_ibs == &perf_ibs_op && (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE))) + sample_type & PERF_SAMPLE_WEIGHT_TYPE || + sample_type & PERF_SAMPLE_ADDR))) return perf_ibs->offset_max; else if (check_rip) return 3; -- cgit v1.2.3 From 5b26af6d2b7854639ddf893366bbca7e74fa7c54 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:56 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_PHY_ADDR IBS_DC_PHYSADDR provides the physical data address for the tagged load/ store operation. Populate perf sample physical address using it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-7-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 8 +++++++- kernel/events/core.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0ad49105c154..3271735f0070 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -989,6 +989,11 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; data->sample_flags |= PERF_SAMPLE_ADDR; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -998,7 +1003,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, (perf_ibs == &perf_ibs_op && (sample_type & PERF_SAMPLE_DATA_SRC || sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR))) + sample_type & PERF_SAMPLE_ADDR || + sample_type & PERF_SAMPLE_PHYS_ADDR))) return perf_ibs->offset_max; else if (check_rip) return 3; diff --git a/kernel/events/core.c b/kernel/events/core.c index e1ffdb861b53..49bc3b5e6c8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7435,7 +7435,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF -- cgit v1.2.3 From 117ceeb1f4f87331e45a77e71f18303d15ec882e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 28 Sep 2022 11:40:42 -0700 Subject: perf/x86/utils: Fix uninitialized var in get_branch_type() offset is passed as a 
pointer and on certain call path is not set by the function. If the caller does not re-initialize offset between calls, value could be inherited between calls. Prevent this by initializing offset on each call. This impacts the code in amd_pmu_lbr_filter() which does: for(i=0; ...) { ret = get_branch_type_fused(..., &offset); if (offset) lbr_entries[i].from += offset; } Fixes: df3e9612f758 ("perf/x86: Make branch classifier fusion-aware") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sandipan Das Link: https://lore.kernel.org/r/20220928184043.408364-2-eranian@google.com --- arch/x86/events/utils.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index 5f5617afde79..76b1f8bb0fd5 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -94,6 +94,10 @@ static int get_branch_type(unsigned long from, unsigned long to, int abort, u8 buf[MAX_INSN_SIZE]; int is64 = 0; + /* make sure we initialize offset */ + if (offset) + *offset = 0; + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; -- cgit v1.2.3 From 3f9a1b3591003b122a6ea2d69f89a0fd96ec58b9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 28 Sep 2022 11:40:43 -0700 Subject: perf/x86/amd/lbr: Adjust LBR regardless of filtering In case of fused compare and taken branch instructions, the AMD LBR points to the compare instruction instead of the branch. Users of LBR usually expects the from address to point to a branch instruction. The kernel has code to adjust the from address via get_branch_type_fused(). However this correction is only applied when a branch filter is applied. That means that if no filter is present, the quality of the data is lower. Fix the problem by applying the adjustment regardless of the filter setting, bringing the AMD LBR to the same level as other LBR implementations. Fixes: 245268c19f70 ("perf/x86/amd/lbr: Use fusion-aware branch classifier") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sandipan Das Link: https://lore.kernel.org/r/20220928184043.408364-3-eranian@google.com --- arch/x86/events/amd/lbr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 2e1c1573efe7..38a75216c12c 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -99,12 +99,13 @@ static void amd_pmu_lbr_filter(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int br_sel = cpuc->br_sel, offset, type, i, j; bool compress = false; + bool fused_only = false; u64 from, to; /* If sampling all branches, there is nothing to filter */ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) - return; + fused_only = true; for (i = 0; i < cpuc->lbr_stack.nr; i++) { from = cpuc->lbr_entries[i].from; @@ -116,8 +117,11 @@ static void amd_pmu_lbr_filter(void) * fusion where it points to an instruction preceding the * actual branch */ - if (offset) + if (offset) { cpuc->lbr_entries[i].from += offset; + if (fused_only) + continue; + } /* If type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { -- cgit v1.2.3