author     Linus Torvalds  2020-10-12 14:14:35 -0700
committer  Linus Torvalds  2020-10-12 14:14:35 -0700
commit     3bff6112c80cecb76af5fe485506f96e8adb6122 (patch)
tree       3a2507f01d3d2d3296742ae1dc59f4ea40d12705
parent     dd502a81077a5f3b3e19fa9a1accffdcab5ad5bc (diff)
parent     f91072ed1b7283b13ca57fcfbece5a3b92726143 (diff)
Merge tag 'perf-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:
"x86 Intel updates:
- Add Jasper Lake support
- Add support for TopDown metrics on Ice Lake
- Fix Ice Lake & Tiger Lake uncore support, add Snow Ridge support
- Add a PCI sub driver to support uncore PMUs where the PCI resources
have been claimed already - extending the range of supported
systems.
x86 AMD updates:
- Restore 'perf stat -a' behaviour to program the uncore PMU to count
all CPU threads.
- Fix setting the proper count when sampling Large Increment per
Cycle events / 'paired' events.
- Fix IBS Fetch sampling on F17h and some other IBS fine tuning,
greatly reducing the number of interrupts when large sample periods
are specified.
- Extend Family 17h RAPL support to also work on compatible F19h
machines.
Core code updates:
- Fix race in perf_mmap_close()
- Add PERF_EV_CAP_SIBLING, to denote that sibling events should be
closed if the leader is removed.
- Smaller fixes and updates"
* tag 'perf-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
perf/core: Fix race in the perf_mmap_close() function
perf/x86: Fix n_metric for cancelled txn
perf/x86: Fix n_pair for cancelled txn
x86/events/amd/iommu: Fix sizeof mismatch
perf/x86/intel: Check perf metrics feature for each CPU
perf/x86/intel: Fix Ice Lake event constraint table
perf/x86/intel/uncore: Fix the scale of the IMC free-running events
perf/x86/intel/uncore: Fix for iio mapping on Skylake Server
perf/x86/msr: Add Jasper Lake support
perf/x86/intel: Add Jasper Lake support
perf/x86/intel/uncore: Reduce the number of CBOX counters
perf/x86/intel/uncore: Update Ice Lake uncore units
perf/x86/intel/uncore: Split the Ice Lake and Tiger Lake MSR uncore support
perf/x86/intel/uncore: Support PCIe3 unit on Snow Ridge
perf/x86/intel/uncore: Generic support for the PCI sub driver
perf/x86/intel/uncore: Factor out uncore_pci_pmu_unregister()
perf/x86/intel/uncore: Factor out uncore_pci_pmu_register()
perf/x86/intel/uncore: Factor out uncore_pci_find_dev_pmu()
perf/x86/intel/uncore: Factor out uncore_pci_get_dev_die_info()
perf/amd/uncore: Inform the user how many counters each uncore PMU has
...
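As context for the TopDown items in the summary above, here is a minimal user-space sketch (not part of the pull request) of how the new Ice Lake metrics are meant to be consumed: a raw SLOTS event (event=0x00,umask=0x4, i.e. raw config 0x0400 per the patch) leads the group, and a topdown-retiring metric event (umask=0x80, raw config 0x8000) is attached as a non-sampling sibling, matching the constraints enforced in intel_pmu_hw_config() below. It assumes a CPU with perf-metrics support and system-wide counting on CPU 0 (which typically needs root or CAP_PERFMON); error handling is omitted.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

static int open_raw_counter(uint64_t config, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_RAW;
	attr.config = config;	/* counting mode only: metric events reject sampling */

	/* pid = -1, cpu = 0: count everything running on CPU 0 */
	return syscall(__NR_perf_event_open, &attr, -1, 0, group_fd, 0);
}

int main(void)
{
	/* SLOTS must be the group leader; the metric event is a sibling. */
	int slots_fd    = open_raw_counter(0x0400, -1);		/* slots */
	int retiring_fd = open_raw_counter(0x8000, slots_fd);	/* topdown-retiring */
	uint64_t slots, retiring;

	ioctl(slots_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
	sleep(1);
	ioctl(slots_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

	read(slots_fd, &slots, sizeof(slots));
	read(retiring_fd, &retiring, sizeof(retiring));
	printf("slots=%llu, retiring slots=%llu\n",
	       (unsigned long long)slots, (unsigned long long)retiring);
	return 0;
}

The perf tool reaches the same counters through the sysfs aliases (slots, topdown-retiring, topdown-bad-spec, topdown-fe-bound, topdown-be-bound) that this series exports for the Ice Lake PMU.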
-rw-r--r--  arch/x86/events/amd/ibs.c              93
-rw-r--r--  arch/x86/events/amd/iommu.c             2
-rw-r--r--  arch/x86/events/amd/uncore.c          186
-rw-r--r--  arch/x86/events/core.c                 91
-rw-r--r--  arch/x86/events/intel/core.c          364
-rw-r--r--  arch/x86/events/intel/ds.c             32
-rw-r--r--  arch/x86/events/intel/uncore.c        275
-rw-r--r--  arch/x86/events/intel/uncore.h          2
-rw-r--r--  arch/x86/events/intel/uncore_snb.c     45
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c   72
-rw-r--r--  arch/x86/events/msr.c                   1
-rw-r--r--  arch/x86/events/perf_event.h           54
-rw-r--r--  arch/x86/events/rapl.c                  1
-rw-r--r--  arch/x86/include/asm/msr-index.h        4
-rw-r--r--  arch/x86/include/asm/perf_event.h      98
-rw-r--r--  include/linux/perf_event.h             34
-rw-r--r--  kernel/events/core.c                  121
-rw-r--r--  kernel/trace/trace_kprobe.c             7
18 files changed, 1182 insertions, 300 deletions
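One detail worth calling out before the per-file diffs: in the arch/x86/events/intel/core.c changes below, PERF_METRICS reports each TopDown metric as an 8-bit fraction of the SLOTS count (the four bytes sum to 0xff), and icl_get_metrics_event_value() scales it with mul_u64_u32_div(slots, val, 0xff). A small stand-alone sketch of that arithmetic, using made-up register values:

#include <stdint.h>
#include <stdio.h>

/*
 * Extract one 8-bit fraction from a PERF_METRICS-style value and scale it
 * by the SLOTS count: slots-in-metric = (fraction / 0xff) * slots.
 * metric_idx 0..3 = retiring, bad-spec, fe-bound, be-bound on Ice Lake.
 * (The kernel uses mul_u64_u32_div() to avoid intermediate overflow.)
 */
static uint64_t metric_slots(uint64_t metrics, uint64_t slots, int metric_idx)
{
	uint32_t frac = (metrics >> (metric_idx * 8)) & 0xff;

	return slots * frac / 0xff;
}

int main(void)
{
	/* Hypothetical snapshot: retiring=0x40 (byte 0), bad-spec=0x08 (byte 1),
	 * fe-bound=0x37 (byte 2), be-bound=0x80 (byte 3); the bytes sum to 0xff. */
	uint64_t metrics = 0x80370840ULL;
	uint64_t slots = 1000000;

	for (int i = 0; i < 4; i++)
		printf("metric %d: %llu slots\n", i,
		       (unsigned long long)metric_slots(metrics, slots, i));
	return 0;
}

Because the fraction only has 8-bit resolution, the patch also guards against a metric's value appearing to move backwards between reads (see __icl_update_topdown_event() below).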
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 26c36357c4c9..40669eac9d6d 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -89,6 +89,7 @@ struct perf_ibs { u64 max_period; unsigned long offset_mask[1]; int offset_max; + unsigned int fetch_count_reset_broken : 1; struct cpu_perf_ibs __percpu *pcpu; struct attribute **format_attrs; @@ -334,11 +335,18 @@ static u64 get_ibs_op_count(u64 config) { u64 count = 0; - if (config & IBS_OP_VAL) - count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ - - if (ibs_caps & IBS_CAPS_RDWROPCNT) - count += (config & IBS_OP_CUR_CNT) >> 32; + /* + * If the internal 27-bit counter rolled over, the count is MaxCnt + * and the lower 7 bits of CurCnt are randomized. + * Otherwise CurCnt has the full 27-bit current counter value. + */ + if (config & IBS_OP_VAL) { + count = (config & IBS_OP_MAX_CNT) << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + count += config & IBS_OP_MAX_CNT_EXT_MASK; + } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { + count = (config & IBS_OP_CUR_CNT) >> 32; + } return count; } @@ -363,7 +371,12 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { - wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); + u64 tmp = hwc->config | config; + + if (perf_ibs->fetch_count_reset_broken) + wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask); + + wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask); } /* @@ -394,7 +407,7 @@ static void perf_ibs_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; + u64 period, config = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -403,13 +416,19 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { + config |= period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + config |= period >> 4; + /* * Set STARTED before enabling the hardware, such that a subsequent NMI * must observe it. */ set_bit(IBS_STARTED, pcpu->state); clear_bit(IBS_STOPPING, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + perf_ibs_enable_event(perf_ibs, hwc, config); perf_event_update_userpage(event); } @@ -577,7 +596,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_ibs_data ibs_data; int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; - u64 *buf, *config, period; + u64 *buf, *config, period, new_config = 0; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -626,18 +645,24 @@ fail: perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + /* + * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ if (event->attr.sample_type & PERF_SAMPLE_RAW) { - /* - * Read IbsBrTarget and IbsOpData4 separately - * depending on their availability. 
- * Can't add to offset_max as they are staggered - */ - if (ibs_caps & IBS_CAPS_BRNTRGT) { - rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); - size++; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } } - if (ibs_caps & IBS_CAPS_OPDATA4) { - rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) { + rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++); size++; } } @@ -666,13 +691,17 @@ out: if (throttle) { perf_ibs_stop(event, 0); } else { - period >>= 4; - - if ((ibs_caps & IBS_CAPS_RDWROPCNT) && - (*config & IBS_OP_CNT_CTL)) - period |= *config & IBS_OP_CUR_CNT_RAND; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + new_config = period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL)) + new_config |= *config & IBS_OP_CUR_CNT_RAND; + } + new_config |= period >> 4; - perf_ibs_enable_event(perf_ibs, hwc, period); + perf_ibs_enable_event(perf_ibs, hwc, new_config); } perf_event_update_userpage(event); @@ -733,12 +762,26 @@ static __init void perf_event_ibs_init(void) { struct attribute **attr = ibs_op_format_attrs; + /* + * Some chips fail to reset the fetch count when it is written; instead + * they need a 0-1 transition of IbsFetchEn. + */ + if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18) + perf_ibs_fetch.fetch_count_reset_broken = 1; + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ibs_caps & IBS_CAPS_OPCNT) { perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; *attr++ = &format_attr_cnt_ctl.attr; } + + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + } + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index fb616203ce42..be50ef8572cc 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -379,7 +379,7 @@ static __init int _init_events_attrs(void) while (amd_iommu_v2_event_descs[i].attr.attr.name) i++; - attrs = kcalloc(i + 1, sizeof(struct attribute **), GFP_KERNEL); + attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) return -ENOMEM; diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 76400c052b0e..7f014d450bc2 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -181,28 +181,28 @@ static void amd_uncore_del(struct perf_event *event, int flags) } /* - * Convert logical CPU number to L3 PMC Config ThreadMask format + * Return a full thread and slice mask unless user + * has provided them */ -static u64 l3_thread_slice_mask(int cpu) +static u64 l3_thread_slice_mask(u64 config) { - u64 thread_mask, core = topology_core_id(cpu); - unsigned int shift, thread = 0; + if (boot_cpu_data.x86 <= 0x18) + return ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) | + ((config & AMD64_L3_THREAD_MASK) ? 
: AMD64_L3_THREAD_MASK); - if (topology_smt_supported() && !topology_is_primary_thread(cpu)) - thread = 1; - - if (boot_cpu_data.x86 <= 0x18) { - shift = AMD64_L3_THREAD_SHIFT + 2 * (core % 4) + thread; - thread_mask = BIT_ULL(shift); - - return AMD64_L3_SLICE_MASK | thread_mask; - } - - core = (core << AMD64_L3_COREID_SHIFT) & AMD64_L3_COREID_MASK; - shift = AMD64_L3_THREAD_SHIFT + thread; - thread_mask = BIT_ULL(shift); + /* + * If the user doesn't specify a threadmask, they're not trying to + * count core 0, so we enable all cores & threads. + * We'll also assume that they want to count slice 0 if they specify + * a threadmask and leave sliceid and enallslices unpopulated. + */ + if (!(config & AMD64_L3_F19H_THREAD_MASK)) + return AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_EN_ALL_CORES; - return AMD64_L3_EN_ALL_SLICES | core | thread_mask; + return config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK | + AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_COREID_MASK); } static int amd_uncore_event_init(struct perf_event *event) @@ -232,7 +232,7 @@ static int amd_uncore_event_init(struct perf_event *event) * For other events, the two fields do not affect the count. */ if (l3_mask && is_llc_event(event)) - hwc->config |= l3_thread_slice_mask(event->cpu); + hwc->config |= l3_thread_slice_mask(event->attr.config); uncore = event_to_amd_uncore(event); if (!uncore) @@ -274,47 +274,72 @@ static struct attribute_group amd_uncore_attr_group = { .attrs = amd_uncore_attrs, }; -/* - * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based - * on family - */ -#define AMD_FORMAT_ATTR(_dev, _name, _format) \ -static ssize_t \ -_dev##_show##_name(struct device *dev, \ - struct device_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev); - -/* Used for each uncore counter type */ -#define AMD_ATTRIBUTE(_name) \ -static struct attribute *amd_uncore_format_attr_##_name[] = { \ - &format_attr_event_##_name.attr, \ - &format_attr_umask.attr, \ - NULL, \ -}; \ -static struct attribute_group amd_uncore_format_group_##_name = { \ - .name = "format", \ - .attrs = amd_uncore_format_attr_##_name, \ -}; \ -static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \ - &amd_uncore_attr_group, \ - &amd_uncore_format_group_##_name, \ - NULL, \ +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + +DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); +DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ +DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask2, threadmask, "config:56-57"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 */ 
+DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ + +static struct attribute *amd_uncore_df_format_attr[] = { + &format_attr_event12.attr, /* event14 if F17h+ */ + &format_attr_umask.attr, + NULL, +}; + +static struct attribute *amd_uncore_l3_format_attr[] = { + &format_attr_event12.attr, /* event8 if F17h+ */ + &format_attr_umask.attr, + NULL, /* slicemask if F17h, coreid if F19h */ + NULL, /* threadmask8 if F17h, enallslices if F19h */ + NULL, /* enallcores if F19h */ + NULL, /* sliceid if F19h */ + NULL, /* threadmask2 if F19h */ + NULL, +}; + +static struct attribute_group amd_uncore_df_format_group = { + .name = "format", + .attrs = amd_uncore_df_format_attr, }; -AMD_FORMAT_ATTR(event, , "config:0-7,32-35"); -AMD_FORMAT_ATTR(umask, , "config:8-15"); -AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60"); -AMD_FORMAT_ATTR(event, _l3, "config:0-7"); -AMD_ATTRIBUTE(df); -AMD_ATTRIBUTE(l3); +static struct attribute_group amd_uncore_l3_format_group = { + .name = "format", + .attrs = amd_uncore_l3_format_attr, +}; + +static const struct attribute_group *amd_uncore_df_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_df_format_group, + NULL, +}; + +static const struct attribute_group *amd_uncore_l3_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_l3_format_group, + NULL, +}; static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_df_attr_groups, + .name = "amd_nb", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -326,6 +351,8 @@ static struct pmu amd_nb_pmu = { static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_l3_attr_groups, + .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -529,6 +556,8 @@ static int amd_uncore_cpu_dead(unsigned int cpu) static int __init amd_uncore_init(void) { + struct attribute **df_attr = amd_uncore_df_format_attr; + struct attribute **l3_attr = amd_uncore_l3_format_attr; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && @@ -538,6 +567,8 @@ static int __init amd_uncore_init(void) if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) return -ENODEV; + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; if (boot_cpu_data.x86 >= 0x17) { /* * For F17h and above, the Northbridge counters are @@ -545,27 +576,16 @@ static int __init amd_uncore_init(void) * counters are supported too. The PMUs are exported * based on family as either L2 or L3 and NB or DF. 
*/ - num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L3; amd_nb_pmu.name = "amd_df"; amd_llc_pmu.name = "amd_l3"; - format_attr_event_df.show = &event_show_df; - format_attr_event_l3.show = &event_show_l3; l3_mask = true; - } else { - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - amd_nb_pmu.name = "amd_nb"; - amd_llc_pmu.name = "amd_l2"; - format_attr_event_df = format_attr_event; - format_attr_event_l3 = format_attr_event; - l3_mask = false; } - amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; - amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + if (boot_cpu_data.x86 >= 0x17) + *df_attr = &format_attr_event14.attr; + amd_uncore_nb = alloc_percpu(struct amd_uncore *); if (!amd_uncore_nb) { ret = -ENOMEM; @@ -575,13 +595,29 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("%s NB counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_nb, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_nb_pmu.name); + ret = 0; } if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + if (boot_cpu_data.x86 >= 0x19) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask.attr; + *l3_attr++ = &format_attr_coreid.attr; + *l3_attr++ = &format_attr_enallslices.attr; + *l3_attr++ = &format_attr_enallcores.attr; + *l3_attr++ = &format_attr_sliceid.attr; + *l3_attr++ = &format_attr_threadmask2.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask.attr; + *l3_attr++ = &format_attr_slicemask.attr; + *l3_attr++ = &format_attr_threadmask8.attr; + } + amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; @@ -591,9 +627,9 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("%s LLC counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_llc, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_llc_pmu.name); ret = 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 360c395d51d0..a88c94d65693 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -105,6 +105,9 @@ u64 x86_perf_event_update(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; + if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) + return x86_pmu.update_topdown_event(event); + /* * Careful: an NMI might modify the previous event value. * @@ -1056,6 +1059,45 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) return unsched ? 
-EINVAL : 0; } +static int add_nr_metric_event(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_metric_event(event)) { + if (cpuc->n_metric == INTEL_TD_METRIC_NUM) + return -EINVAL; + cpuc->n_metric++; + cpuc->n_txn_metric++; + } + + return 0; +} + +static void del_nr_metric_event(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_metric_event(event)) + cpuc->n_metric--; +} + +static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, + int max_count, int n) +{ + + if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) + return -EINVAL; + + if (n >= max_count + cpuc->n_metric) + return -EINVAL; + + cpuc->event_list[n] = event; + if (is_counter_pair(&event->hw)) { + cpuc->n_pair++; + cpuc->n_txn_pair++; + } + + return 0; +} + /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code @@ -1092,28 +1134,22 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, } if (is_x86_event(leader)) { - if (n >= max_count) + if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; - cpuc->event_list[n] = leader; n++; - if (is_counter_pair(&leader->hw)) - cpuc->n_pair++; } + if (!dogrp) return n; for_each_sibling_event(event, leader) { - if (!is_x86_event(event) || - event->state <= PERF_EVENT_STATE_OFF) + if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; - if (n >= max_count) + if (collect_event(cpuc, event, max_count, n)) return -EINVAL; - cpuc->event_list[n] = event; n++; - if (is_counter_pair(&event->hw)) - cpuc->n_pair++; } return n; } @@ -1135,11 +1171,16 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->event_base = 0; break; + case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: + /* All the metric events are mapped onto the fixed counter 3. */ + idx = INTEL_PMC_IDX_FIXED_SLOTS; + /* fall through */ case INTEL_PMC_IDX_FIXED ... 
INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30; + hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | + INTEL_PMC_FIXED_RDPMC_BASE; break; default: @@ -1270,6 +1311,10 @@ int x86_perf_event_set_period(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; + if (unlikely(is_topdown_count(event)) && + x86_pmu.set_topdown_event_period) + return x86_pmu.set_topdown_event_period(event); + /* * If we are way outside a reasonable range then just skip forward: */ @@ -1309,11 +1354,11 @@ int x86_perf_event_set_period(struct perf_event *event) wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* - * Clear the Merge event counter's upper 16 bits since + * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) - wrmsrl(x86_pmu_event_addr(idx + 1), 0); + wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); /* * Due to erratum on certan cpu we need @@ -1551,6 +1596,8 @@ static void x86_pmu_del(struct perf_event *event, int flags) } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; + if (x86_pmu.intel_cap.perf_metrics) + del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); @@ -2018,6 +2065,8 @@ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); + __this_cpu_write(cpu_hw_events.n_txn_pair, 0); + __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* @@ -2043,6 +2092,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); + __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } @@ -2264,17 +2315,15 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m static int x86_pmu_event_idx(struct perf_event *event) { - int idx = event->hw.idx; + struct hw_perf_event *hwc = &event->hw; - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return 0; - if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { - idx -= INTEL_PMC_IDX_FIXED; - idx |= 1 << 30; - } - - return idx + 1; + if (is_metric_idx(hwc->idx)) + return INTEL_PMC_FIXED_RDPMC_METRICS + 1; + else + return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 31e6887d24f1..f1926e9f2143 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -243,10 +243,14 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + 
METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ @@ -309,6 +313,12 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); + static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), @@ -2165,11 +2175,24 @@ static inline void intel_clear_masks(struct perf_event *event, int idx) static void intel_pmu_disable_fixed(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; + int idx = hwc->idx; - mask = 0xfULL << (idx * 4); + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * When there are other active TopDown events, + * don't disable the fixed counter 3. + */ + if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) + return; + idx = INTEL_PMC_IDX_FIXED_SLOTS; + } + intel_clear_masks(event, idx); + + mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; wrmsrl(hwc->config_base, ctrl_val); @@ -2180,17 +2203,28 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - if (idx < INTEL_PMC_IDX_FIXED) { + switch (idx) { + case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); x86_pmu_disable_event(event); - } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { - intel_clear_masks(event, idx); + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_disable_fixed(event); - } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED_BTS: intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); - } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + return; + case INTEL_PMC_IDX_FIXED_VLBR: intel_clear_masks(event, idx); + break; + default: + intel_clear_masks(event, idx); + pr_warn("Failed to disable the event with invalid index %d\n", + idx); + return; + } /* * Needs to be called after x86_pmu_disable_event, @@ -2208,10 +2242,189 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_pebs_del(event); } +static int icl_set_topdown_event_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + + /* + * The values in PERF_METRICS MSR are derived from fixed counter 3. + * Software should start both registers, PERF_METRICS and fixed + * counter 3, from zero. + * Clear PERF_METRICS and Fixed counter 3 in initialization. + * After that, both MSRs will be cleared for each read. + * Don't need to clear them again. 
+ */ + if (left == x86_pmu.max_period) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrl(MSR_PERF_METRICS, 0); + hwc->saved_slots = 0; + hwc->saved_metric = 0; + } + + if ((hwc->saved_slots) && is_slots_event(event)) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots); + wrmsrl(MSR_PERF_METRICS, hwc->saved_metric); + } + + perf_event_update_userpage(event); + + return 0; +} + +static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) +{ + u32 val; + + /* + * The metric is reported as an 8bit integer fraction + * suming up to 0xff. + * slots-in-metric = (Metric / 0xff) * slots + */ + val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff; + return mul_u64_u32_div(slots, val, 0xff); +} + +static u64 icl_get_topdown_value(struct perf_event *event, + u64 slots, u64 metrics) +{ + int idx = event->hw.idx; + u64 delta; + + if (is_metric_idx(idx)) + delta = icl_get_metrics_event_value(metrics, slots, idx); + else + delta = slots; + + return delta; +} + +static void __icl_update_topdown_event(struct perf_event *event, + u64 slots, u64 metrics, + u64 last_slots, u64 last_metrics) +{ + u64 delta, last = 0; + + delta = icl_get_topdown_value(event, slots, metrics); + if (last_slots) + last = icl_get_topdown_value(event, last_slots, last_metrics); + + /* + * The 8bit integer fraction of metric may be not accurate, + * especially when the changes is very small. + * For example, if only a few bad_spec happens, the fraction + * may be reduced from 1 to 0. If so, the bad_spec event value + * will be 0 which is definitely less than the last value. + * Avoid update event->count for this case. + */ + if (delta > last) { + delta -= last; + local64_add(delta, &event->count); + } +} + +static void update_saved_topdown_regs(struct perf_event *event, + u64 slots, u64 metrics) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *other; + int idx; + + event->hw.saved_slots = slots; + event->hw.saved_metric = metrics; + + for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + if (!is_topdown_idx(idx)) + continue; + other = cpuc->events[idx]; + other->hw.saved_slots = slots; + other->hw.saved_metric = metrics; + } +} + +/* + * Update all active Topdown events. + * + * The PERF_METRICS and Fixed counter 3 are read separately. The values may be + * modify by a NMI. PMU has to be disabled before calling this function. + */ +static u64 icl_update_topdown_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *other; + u64 slots, metrics; + bool reset = true; + int idx; + + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; + + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + + for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) { + if (!is_topdown_idx(idx)) + continue; + other = cpuc->events[idx]; + __icl_update_topdown_event(other, slots, metrics, + event ? event->hw.saved_slots : 0, + event ? event->hw.saved_metric : 0); + } + + /* + * Check and update this event, which may have been cleared + * in active_mask e.g. x86_pmu_stop() + */ + if (event && !test_bit(event->hw.idx, cpuc->active_mask)) { + __icl_update_topdown_event(event, slots, metrics, + event->hw.saved_slots, + event->hw.saved_metric); + + /* + * In x86_pmu_stop(), the event is cleared in active_mask first, + * then drain the delta, which indicates context switch for + * counting. 
+ * Save metric and slots for context switch. + * Don't need to reset the PERF_METRICS and Fixed counter 3. + * Because the values will be restored in next schedule in. + */ + update_saved_topdown_regs(event, slots, metrics); + reset = false; + } + + if (reset) { + /* The fixed counter 3 has to be written before the PERF_METRICS. */ + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrl(MSR_PERF_METRICS, 0); + if (event) + update_saved_topdown_regs(event, 0, 0); + } + + return slots; +} + +static void intel_pmu_read_topdown_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* Only need to call update_topdown_event() once for group read. */ + if ((cpuc->txn_flags & PERF_PMU_TXN_READ) && + !is_slots_event(event)) + return; + + perf_pmu_disable(event->pmu); + x86_pmu.update_topdown_event(event); + perf_pmu_enable(event->pmu); +} + static void intel_pmu_read_event(struct perf_event *event) { if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_auto_reload_read(event); + else if (is_topdown_count(event) && x86_pmu.update_topdown_event) + intel_pmu_read_topdown_event(event); else x86_perf_event_update(event); } @@ -2219,8 +2432,22 @@ static void intel_pmu_read_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask, bits = 0; + int idx = hwc->idx; + + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + /* + * When there are other active TopDown events, + * don't enable the fixed counter 3 again. + */ + if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) + return; + + idx = INTEL_PMC_IDX_FIXED_SLOTS; + } + + intel_set_masks(event, idx); /* * Enable IRQ generation (0x8), if not PEBS, @@ -2240,6 +2467,7 @@ static void intel_pmu_enable_fixed(struct perf_event *event) if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) bits |= 0x4; + idx -= INTEL_PMC_IDX_FIXED; bits <<= (idx * 4); mask = 0xfULL << (idx * 4); @@ -2262,18 +2490,27 @@ static void intel_pmu_enable_event(struct perf_event *event) if (unlikely(event->attr.precise_ip)) intel_pmu_pebs_enable(event); - if (idx < INTEL_PMC_IDX_FIXED) { + switch (idx) { + case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_set_masks(event, idx); __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { - intel_set_masks(event, idx); + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); - } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + break; + case INTEL_PMC_IDX_FIXED_BTS: if (!__this_cpu_read(cpu_hw_events.enabled)) return; intel_pmu_enable_bts(hwc->config); - } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + break; + case INTEL_PMC_IDX_FIXED_VLBR: intel_set_masks(event, idx); + break; + default: + pr_warn("Failed to enable the event with invalid index %d\n", + idx); + } } static void intel_pmu_add_event(struct perf_event *event) @@ -2389,7 +2626,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) /* * PEBS overflow sets bit 62 in the global status register */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long *)&status)) { u64 pebs_enabled = cpuc->pebs_enabled; handled++; @@ -2410,7 +2647,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) /* * Intel PT */ - if (__test_and_clear_bit(55, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && perf_guest_cbs->handle_intel_pt_intr)) @@ -2420,6 +2657,15 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* + * Intel Perf mertrics + */ + if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { + handled++; + if (x86_pmu.update_topdown_event) + x86_pmu.update_topdown_event(NULL); + } + + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status * bit. Therefore always force probe these counters. @@ -3355,6 +3601,56 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.type != PERF_TYPE_RAW) return 0; + /* + * Config Topdown slots and metric events + * + * The slots event on Fixed Counter 3 can support sampling, + * which will be handled normally in x86_perf_event_update(). + * + * Metric events don't support sampling and require being paired + * with a slots event as group leader. When the slots event + * is used in a metrics group, it too cannot support sampling. + */ + if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) { + if (event->attr.config1 || event->attr.config2) + return -EINVAL; + + /* + * The TopDown metrics events and slots event don't + * support any filters. + */ + if (event->attr.config & X86_ALL_EVENT_FLAGS) + return -EINVAL; + + if (is_metric_event(event)) { + struct perf_event *leader = event->group_leader; + + /* The metric events don't support sampling. */ + if (is_sampling_event(event)) + return -EINVAL; + + /* The metric events require a slots group leader. */ + if (!is_slots_event(leader)) + return -EINVAL; + + /* + * The leader/SLOTS must not be a sampling event for + * metric use; hardware requires it starts at 0 when used + * in conjunction with MSR_PERF_METRICS. + */ + if (is_sampling_event(leader)) + return -EINVAL; + + event->event_caps |= PERF_EV_CAP_SIBLING; + /* + * Only once we have a METRICs sibling do we + * need TopDown magic. + */ + leader->hw.flags |= PERF_X86_EVENT_TOPDOWN; + event->hw.flags |= PERF_X86_EVENT_TOPDOWN; + } + } + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0; @@ -3787,6 +4083,17 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.counter_freezing) enable_counter_freeze(); + /* Disable perf metrics if any added CPU doesn't support it. 
*/ + if (x86_pmu.intel_cap.perf_metrics) { + union perf_capabilities perf_cap; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); + if (!perf_cap.perf_metrics) { + x86_pmu.intel_cap.perf_metrics = 0; + x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + } + } + if (!cpuc->shared_regs) return; @@ -4355,6 +4662,15 @@ static struct attribute *icl_events_attrs[] = { NULL, }; +static struct attribute *icl_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + NULL, +}; + static struct attribute *icl_tsx_events_attrs[] = { EVENT_PTR(tx_start), EVENT_PTR(tx_abort), @@ -4830,6 +5146,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -5139,10 +5456,13 @@ __init int intel_pmu_init(void) hsw_format_attr : nhm_format_attr; extra_skl_attr = skl_format_attr; mem_attr = icl_events_attrs; + td_attr = icl_td_events_attrs; tsx_attr = icl_tsx_events_attrs; x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.update_topdown_event = icl_update_topdown_event; + x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; pr_cont("Icelake events, "); name = "icelake"; break; @@ -5198,6 +5518,15 @@ __init int intel_pmu_init(void) * counter, so do not extend mask to generic counters */ for_each_event_constraint(c, x86_pmu.event_constraints) { + /* + * Don't extend the topdown slots and metrics + * events to the generic counters. + */ + if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + c->weight = hweight64(c->idxmsk64); + continue; + } + if (c->cmask == FIXED_EVENT_FLAGS && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; @@ -5253,6 +5582,9 @@ __init int intel_pmu_init(void) if (x86_pmu.counter_freezing) x86_pmu.handle_irq = intel_pmu_handle_irq_v4; + if (x86_pmu.intel_cap.perf_metrics) + x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 86848c57b55e..404315df1e16 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -670,9 +670,7 @@ unlock: static inline void intel_pmu_drain_pebs_buffer(void) { - struct pt_regs regs; - - x86_pmu.drain_pebs(®s); + x86_pmu.drain_pebs(NULL); } /* @@ -1737,6 +1735,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); + struct pt_regs dummy_iregs; if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { /* @@ -1749,6 +1748,9 @@ static void __intel_pmu_pebs_event(struct perf_event *event, } else if (!intel_pmu_save_and_restart(event)) return; + if (!iregs) + iregs = &dummy_iregs; + while (count > 1) { setup_sample(event, iregs, at, &data, regs); perf_event_output(event, &data, regs); @@ -1758,16 +1760,22 @@ static void __intel_pmu_pebs_event(struct perf_event *event, } setup_sample(event, iregs, at, &data, regs); - - /* - * All but the last records are processed. - * The last one is left to be able to call the overflow handler. 
- */ - if (perf_event_overflow(event, &data, regs)) { - x86_pmu_stop(event, 0); - return; + if (iregs == &dummy_iregs) { + /* + * The PEBS records may be drained in the non-overflow context, + * e.g., large PEBS + context switch. Perf should treat the + * last record the same as other PEBS records, and doesn't + * invoke the generic overflow handler. + */ + perf_event_output(event, &data, regs); + } else { + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. + */ + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); } - } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index d5c6d3b340c5..86d012b3e0b4 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -12,6 +12,8 @@ struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; +/* The PCI driver for the device which the uncore doesn't own. */ +struct pci_driver *uncore_pci_sub_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); @@ -989,65 +991,71 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) } /* - * add a pci uncore device + * Get the die information of a PCI device. + * @pdev: The PCI device. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. */ -static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, + int *phys_id, int *die) { - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu = NULL; - struct intel_uncore_box *box; - int phys_id, die, ret; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) + *phys_id = uncore_pcibus_to_physid(pdev->bus); + if (*phys_id < 0) return -ENODEV; - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); - if (die < 0) + *die = (topology_max_die_per_package() > 1) ? *phys_id : + topology_phys_to_logical_pkg(*phys_id); + if (*die < 0) return -EINVAL; - if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - - uncore_extra_pci_dev[die].dev[idx] = pdev; - pci_set_drvdata(pdev, NULL); - return 0; - } - - type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + return 0; +} - /* - * Some platforms, e.g. Knights Landing, use a common PCI device ID - * for multiple instances of an uncore PMU device type. We should check - * PCI slot and func to indicate the uncore box. - */ - if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; - const struct pci_device_id *ids = pci_drv->id_table; - unsigned int devfn; - - while (ids && ids->vendor) { - if ((ids->vendor == pdev->vendor) && - (ids->device == pdev->device)) { - devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), - UNCORE_PCI_DEV_FUNC(ids->driver_data)); - if (devfn == pdev->devfn) { - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - break; - } +/* + * Find the PMU of a PCI device. + * @pdev: The PCI device. + * @ids: The ID table of the available PCI devices with a PMU. 
+ */ +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) +{ + struct intel_uncore_pmu *pmu = NULL; + struct intel_uncore_type *type; + kernel_ulong_t data; + unsigned int devfn; + + while (ids && ids->vendor) { + if ((ids->vendor == pdev->vendor) && + (ids->device == pdev->device)) { + data = ids->driver_data; + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data), + UNCORE_PCI_DEV_FUNC(data)); + if (devfn == pdev->devfn) { + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)]; + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)]; + break; } - ids++; } - if (pmu == NULL) - return -ENODEV; - } else { - /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + ids++; } + return pmu; +} + +/* + * Register the PMU for a PCI device + * @pdev: The PCI device. + * @type: The corresponding PMU type of the device. + * @pmu: The corresponding PMU of the device. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. + */ +static int uncore_pci_pmu_register(struct pci_dev *pdev, + struct intel_uncore_type *type, + struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box; + int ret; if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) return -EINVAL; @@ -1067,7 +1075,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); - pci_set_drvdata(pdev, box); pmu->boxes[die] = box; if (atomic_inc_return(&pmu->activeboxes) > 1) @@ -1076,7 +1083,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id /* First active box registers the pmu */ ret = uncore_pmu_register(pmu); if (ret) { - pci_set_drvdata(pdev, NULL); pmu->boxes[die] = NULL; uncore_box_exit(box); kfree(box); @@ -1084,18 +1090,87 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id return ret; } +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu = NULL; + int phys_id, die, ret; + + ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die); + if (ret) + return ret; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + + uncore_extra_pci_dev[die].dev[idx] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + + /* + * Some platforms, e.g. Knights Landing, use a common PCI device ID + * for multiple instances of an uncore PMU device type. We should check + * PCI slot and func to indicate the uncore box. + */ + if (id->driver_data & ~0xffff) { + struct pci_driver *pci_drv = pdev->driver; + + pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); + if (pmu == NULL) + return -ENODEV; + } else { + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + } + + ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die); + + pci_set_drvdata(pdev, pmu->boxes[die]); + + return ret; +} + +/* + * Unregister the PMU of a PCI device + * @pmu: The corresponding PMU is unregistered. + * @phys_id: The physical socket id which the device maps to. + * @die: The die id which the device maps to. 
+ */ +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, + int phys_id, int die) +{ + struct intel_uncore_box *box = pmu->boxes[die]; + + if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) + return; + + pmu->boxes[die] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); + uncore_box_exit(box); + kfree(box); +} + static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; int i, phys_id, die; - phys_id = uncore_pcibus_to_physid(pdev->bus); + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return; box = pci_get_drvdata(pdev); if (!box) { - die = (topology_max_die_per_package() > 1) ? phys_id : - topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { if (uncore_extra_pci_dev[die].dev[i] == pdev) { uncore_extra_pci_dev[die].dev[i] = NULL; @@ -1107,15 +1182,84 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->dieid] = NULL; - if (atomic_dec_return(&pmu->activeboxes) == 0) - uncore_pmu_unregister(pmu); - uncore_box_exit(box); - kfree(box); + + uncore_pci_pmu_unregister(pmu, phys_id, die); +} + +static int uncore_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct intel_uncore_pmu *pmu; + int phys_id, die; + + /* Unregister the PMU when the device is going to be deleted. */ + if (action != BUS_NOTIFY_DEL_DEVICE) + return NOTIFY_DONE; + + pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table); + if (!pmu) + return NOTIFY_DONE; + + if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die)) + return NOTIFY_DONE; + + uncore_pci_pmu_unregister(pmu, phys_id, die); + + return NOTIFY_OK; +} + +static struct notifier_block uncore_notifier = { + .notifier_call = uncore_bus_notify, +}; + +static void uncore_pci_sub_driver_init(void) +{ + const struct pci_device_id *ids = uncore_pci_sub_driver->id_table; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pci_sub_dev; + bool notify = false; + unsigned int devfn; + int phys_id, die; + + while (ids && ids->vendor) { + pci_sub_dev = NULL; + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)]; + /* + * Search the available device, and register the + * corresponding PMU. 
+ */ + while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + ids->device, pci_sub_dev))) { + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), + UNCORE_PCI_DEV_FUNC(ids->driver_data)); + if (devfn != pci_sub_dev->devfn) + continue; + + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; + if (!pmu) + continue; + + if (uncore_pci_get_dev_die_info(pci_sub_dev, + &phys_id, &die)) + continue; + + if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, + phys_id, die)) + notify = true; + } + ids++; + } + + if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier)) + notify = false; + + if (!notify) + uncore_pci_sub_driver = NULL; } static int __init uncore_pci_init(void) @@ -1141,6 +1285,9 @@ static int __init uncore_pci_init(void) if (ret) goto errtype; + if (uncore_pci_sub_driver) + uncore_pci_sub_driver_init(); + pcidrv_registered = true; return 0; @@ -1158,6 +1305,8 @@ static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; + if (uncore_pci_sub_driver) + bus_unregister_notifier(&pci_bus_type, &uncore_notifier); pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); @@ -1478,12 +1627,12 @@ static const struct intel_uncore_init_fun icl_uncore_init __initconst = { }; static const struct intel_uncore_init_fun tgl_uncore_init __initconst = { - .cpu_init = icl_uncore_cpu_init, + .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_uncore_mmio_init, }; static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { - .cpu_init = icl_uncore_cpu_init, + .cpu_init = tgl_uncore_cpu_init, .mmio_init = tgl_l_uncore_mmio_init, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 105fdc69825e..83d2a7d490e0 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -552,6 +552,7 @@ extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct intel_uncore_type **uncore_mmio_uncores; extern struct pci_driver *uncore_pci_driver; +extern struct pci_driver *uncore_pci_sub_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; @@ -567,6 +568,7 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); +void tgl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 6a4ca27b2c9e..39e632ed6ca9 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -126,6 +126,10 @@ #define ICL_UNC_CBO_0_PER_CTR0 0x702 #define ICL_UNC_CBO_MSR_OFFSET 0x8 +/* ICL ARB register */ +#define ICL_UNC_ARB_PER_CTR 0x3b1 +#define ICL_UNC_ARB_PERFEVTSEL 0x3b3 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); @@ -313,15 +317,21 @@ void skl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_ops icl_uncore_msr_ops = { + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + static struct intel_uncore_type icl_uncore_cbox = { .name = "cbox", - .num_counters = 4, + .num_counters = 2, .perf_ctr_bits = 44, .perf_ctr = 
ICL_UNC_CBO_0_PER_CTR0, .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = ICL_UNC_CBO_MSR_OFFSET, - .ops = &skl_uncore_msr_ops, + .ops = &icl_uncore_msr_ops, .format_group = &snb_uncore_format_group, }; @@ -350,13 +360,25 @@ static struct intel_uncore_type icl_uncore_clockbox = { .single_fixed = 1, .event_mask = SNB_UNC_CTL_EV_SEL_MASK, .format_group = &icl_uncore_clock_format_group, - .ops = &skl_uncore_msr_ops, + .ops = &icl_uncore_msr_ops, .event_descs = icl_uncore_events, }; +static struct intel_uncore_type icl_uncore_arb = { + .name = "arb", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = ICL_UNC_ARB_PER_CTR, + .event_ctl = ICL_UNC_ARB_PERFEVTSEL, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .ops = &icl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + static struct intel_uncore_type *icl_msr_uncores[] = { &icl_uncore_cbox, - &snb_uncore_arb, + &icl_uncore_arb, &icl_uncore_clockbox, NULL, }; @@ -374,6 +396,21 @@ void icl_uncore_cpu_init(void) { uncore_msr_uncores = icl_msr_uncores; icl_uncore_cbox.num_boxes = icl_get_cbox_num(); +} + +static struct intel_uncore_type *tgl_msr_uncores[] = { + &icl_uncore_cbox, + &snb_uncore_arb, + &icl_uncore_clockbox, + NULL, +}; + +void tgl_uncore_cpu_init(void) +{ + uncore_msr_uncores = tgl_msr_uncores; + icl_uncore_cbox.num_boxes = icl_get_cbox_num(); + icl_uncore_cbox.ops = &skl_uncore_msr_ops; + icl_uncore_clockbox.ops = &skl_uncore_msr_ops; snb_uncore_arb.ops = &skl_uncore_msr_ops; } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 62e88ad919ff..7bdb1821215d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -393,6 +393,11 @@ #define SNR_M2M_PCI_PMON_BOX_CTL 0x438 #define SNR_M2M_PCI_PMON_UMASK_EXT 0xff +/* SNR PCIE3 */ +#define SNR_PCIE3_PCI_PMON_CTL0 0x508 +#define SNR_PCIE3_PCI_PMON_CTR0 0x4e8 +#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e0 + /* SNR IMC */ #define SNR_IMC_MMIO_PMON_FIXED_CTL 0x54 #define SNR_IMC_MMIO_PMON_FIXED_CTR 0x38 @@ -3749,7 +3754,9 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) ret = skx_iio_get_topology(type); if (ret) - return ret; + goto clear_attr_update; + + ret = -ENOMEM; /* One more for NULL. 
*/ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); @@ -3781,8 +3788,9 @@ err: kfree(eas); kfree(attrs); kfree(type->topology); +clear_attr_update: type->attr_update = NULL; - return -ENOMEM; + return ret; } static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) @@ -4551,12 +4559,46 @@ static struct intel_uncore_type snr_uncore_m2m = { .format_group = &snr_m2m_uncore_format_group, }; +static void snr_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, (u32)(hwc->config | SNBEP_PMON_CTL_EN)); + pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32)); +} + +static struct intel_uncore_ops snr_pcie3_uncore_pci_ops = { + .init_box = snr_m2m_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snr_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type snr_uncore_pcie3 = { + .name = "pcie3", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0, + .event_ctl = SNR_PCIE3_PCI_PMON_CTL0, + .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = SNR_PCIE3_PCI_PMON_BOX_CTL, + .ops = &snr_pcie3_uncore_pci_ops, + .format_group = &skx_uncore_iio_format_group, +}; + enum { SNR_PCI_UNCORE_M2M, + SNR_PCI_UNCORE_PCIE3, }; static struct intel_uncore_type *snr_pci_uncores[] = { [SNR_PCI_UNCORE_M2M] = &snr_uncore_m2m, + [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3, NULL, }; @@ -4573,6 +4615,19 @@ static struct pci_driver snr_uncore_pci_driver = { .id_table = snr_uncore_pci_ids, }; +static const struct pci_device_id snr_uncore_pci_sub_ids[] = { + { /* PCIe3 RP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snr_uncore_pci_sub_driver = { + .name = "snr_uncore_sub", + .id_table = snr_uncore_pci_sub_ids, +}; + int snr_uncore_pci_init(void) { /* SNR UBOX DID */ @@ -4584,6 +4639,7 @@ int snr_uncore_pci_init(void) uncore_pci_uncores = snr_pci_uncores; uncore_pci_driver = &snr_uncore_pci_driver; + uncore_pci_sub_driver = &snr_uncore_pci_sub_driver; return 0; } @@ -4751,10 +4807,10 @@ static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -5212,17 +5268,17 @@ static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - 
INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"), - INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"), - INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"), { /* end: all zeroes */ }, }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index a949f6f55991..4be8f9cabd07 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -78,6 +78,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7b68ab5f19e7..ee2b9b9fc2a5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -79,6 +79,31 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ + +static inline bool is_topdown_count(struct perf_event *event) +{ + return event->hw.flags & PERF_X86_EVENT_TOPDOWN; +} + +static inline bool is_metric_event(struct perf_event *event) +{ + u64 config = event->attr.config; + + return ((config & ARCH_PERFMON_EVENTSEL_EVENT) == 0) && + ((config & INTEL_ARCH_EVENT_MASK) >= INTEL_TD_METRIC_RETIRING) && + ((config & INTEL_ARCH_EVENT_MASK) <= INTEL_TD_METRIC_MAX); +} + +static inline bool is_slots_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_TD_SLOTS; +} + +static inline bool is_topdown_event(struct perf_event *event) +{ + return is_metric_event(event) || is_slots_event(event); +} struct amd_nb { int nb_id; /* NorthBridge id */ @@ -210,6 +235,8 @@ struct cpu_hw_events { they've never been enabled yet */ int n_txn; /* the # last events in the below arrays; added in the current transaction */ + int n_txn_pair; + int n_txn_metric; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; @@ -285,6 +312,12 @@ struct cpu_hw_events { u64 tfa_shadow; /* + * Perf Metrics + */ + /* number of accepted metrics events */ + int n_metric; + + /* * AMD specific bits */ struct amd_nb *amd_nb; @@ -376,6 +409,19 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) /* + * The special metric counters do not actually exist. They are calculated from + * the combination of the FxCtr3 + MSR_PERF_METRICS. + * + * The special metric counters are mapped to a dummy offset for the scheduler. + * The sharing between multiple users of the same metric without multiplexing + * is not allowed, even though the hardware supports that in principle. 
+ */ + +#define METRIC_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (INTEL_PMC_IDX_METRIC_BASE + n)), \ + INTEL_ARCH_EVENT_MASK) + +/* * Constraint on the Event code + UMask */ #define INTEL_UEVENT_CONSTRAINT(c, n) \ @@ -537,7 +583,7 @@ union perf_capabilities { */ u64 full_width_write:1; u64 pebs_baseline:1; - u64 pebs_metrics_available:1; + u64 perf_metrics:1; u64 pebs_output_pt_available:1; }; u64 capabilities; @@ -727,6 +773,12 @@ struct x86_pmu { atomic_t lbr_exclusive[x86_lbr_exclusive_max]; /* + * Intel perf metrics + */ + u64 (*update_topdown_event)(struct perf_event *event); + int (*set_topdown_event_period)(struct perf_event *event); + + /* * perf task context (i.e. struct perf_event_context::task_ctx_data) * switch helper to bridge calls from perf/core to perf/x86. * See struct pmu::swap_task_ctx() usage for examples; diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 67b411f7e8c4..7c0120e2e957 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -815,6 +815,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), + X86_MATCH_VENDOR_FAM(AMD, 0x19, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index aaddc6a9e237..c07a70ce7ffd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -467,6 +467,7 @@ #define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1) #define MSR_AMD64_IBSCTL 0xc001103a #define MSR_AMD64_IBSBRTARGET 0xc001103b +#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ #define MSR_AMD64_SEV 0xc0010131 @@ -860,11 +861,14 @@ #define MSR_CORE_PERF_FIXED_CTR0 0x00000309 #define MSR_CORE_PERF_FIXED_CTR1 0x0000030a #define MSR_CORE_PERF_FIXED_CTR2 0x0000030b +#define MSR_CORE_PERF_FIXED_CTR3 0x0000030c #define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 +#define MSR_PERF_METRICS 0x00000329 + /* PERF_GLOBAL_OVF_CTL bits */ #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT 55 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI (1ULL << MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0c1b13720525..6960cd6d1f23 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -196,13 +196,29 @@ struct x86_pmu_capability { * Fixed-purpose performance events: */ +/* RDPMC offset for Fixed PMCs */ +#define INTEL_PMC_FIXED_RDPMC_BASE (1 << 30) +#define INTEL_PMC_FIXED_RDPMC_METRICS (1 << 29) + /* - * All 3 fixed-mode PMCs are configured via this single MSR: + * All the fixed-mode PMCs are configured via this single MSR: */ #define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d /* - * The counts are available in three separate MSRs: + * There is no event-code assigned to the fixed-mode PMCs. + * + * For a fixed-mode PMC, which has an equivalent event on a general-purpose + * PMC, the event-code of the equivalent event is used for the fixed-mode PMC, + * e.g., Instr_Retired.Any and CPU_CLK_Unhalted.Core. 
+ * + * For a fixed-mode PMC, which doesn't have an equivalent event, a + * pseudo-encoding is used, e.g., CPU_CLK_Unhalted.Ref and TOPDOWN.SLOTS. + * The pseudo event-code for a fixed-mode PMC must be 0x00. + * The pseudo umask-code is 0xX. The X equals the index of the fixed + * counter + 1, e.g., the fixed counter 2 has the pseudo-encoding 0x0300. + * + * The counts are available in separate MSRs: */ /* Instr_Retired.Any: */ @@ -213,29 +229,84 @@ struct x86_pmu_capability { #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a #define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) -/* CPU_CLK_Unhalted.Ref: */ +/* CPU_CLK_Unhalted.Ref: event=0x00,umask=0x3 (pseudo-encoding) */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) #define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) +/* TOPDOWN.SLOTS: event=0x00,umask=0x4 (pseudo-encoding) */ +#define MSR_ARCH_PERFMON_FIXED_CTR3 0x30c +#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3) +#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS) + /* * We model BTS tracing as another fixed-mode PMC. * - * We choose a value in the middle of the fixed event range, since lower + * We choose the value 47 for the fixed index of BTS, since lower * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ -#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 15) + +/* + * The PERF_METRICS MSR is modeled as several magic fixed-mode PMCs, one for + * each TopDown metric event. + * + * Internally the TopDown metric events are mapped to the FxCtr 3 (SLOTS). + */ +#define INTEL_PMC_IDX_METRIC_BASE (INTEL_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_TD_RETIRING (INTEL_PMC_IDX_METRIC_BASE + 0) +#define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1) +#define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2) +#define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3) +#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND +#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \ + INTEL_PMC_MSK_FIXED_SLOTS) -#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) -#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62) -#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) -#define GLOBAL_STATUS_ASIF BIT_ULL(60) -#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) -#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 -#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) -#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) +/* + * There is no event-code assigned to the TopDown events. + * + * For the slots event, use the pseudo code of the fixed counter 3. + * + * For the metric events, the pseudo event-code is 0x00. + * The pseudo umask-code starts from the middle of the pseudo event + * space, 0x80. 
+ */ +#define INTEL_TD_SLOTS 0x0400 /* TOPDOWN.SLOTS */ +/* Level 1 metrics */ +#define INTEL_TD_METRIC_RETIRING 0x8000 /* Retiring metric */ +#define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */ +#define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */ +#define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */ +#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND +#define INTEL_TD_METRIC_NUM 4 + +static inline bool is_metric_idx(int idx) +{ + return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; +} + +static inline bool is_topdown_idx(int idx) +{ + return is_metric_idx(idx) || idx == INTEL_PMC_IDX_FIXED_SLOTS; +} +#define INTEL_PMC_OTHER_TOPDOWN_BITS(bit) \ + (~(0x1ull << bit) & INTEL_PMC_MSK_TOPDOWN) + +#define GLOBAL_STATUS_COND_CHG BIT_ULL(63) +#define GLOBAL_STATUS_BUFFER_OVF_BIT 62 +#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(GLOBAL_STATUS_BUFFER_OVF_BIT) +#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) +#define GLOBAL_STATUS_ASIF BIT_ULL(60) +#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) +#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) +#define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55 +#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) +#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 + +#define GLOBAL_CTRL_EN_PERF_METRICS 48 /* * We model guest LBR event tracing as another fixed-mode PMC like BTS. * @@ -334,6 +405,7 @@ struct pebs_xmm { #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ #define IBS_RIP_INVALID (1ULL<<38) #ifdef CONFIG_X86_LOCAL_APIC diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 04a49ccc7beb..0c19d279b97f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -212,17 +212,26 @@ struct hw_perf_event { */ u64 sample_period; - /* - * The period we started this sample with. - */ - u64 last_period; + union { + struct { /* Sampling */ + /* + * The period we started this sample with. + */ + u64 last_period; - /* - * However much is left of the current period; note that this is - * a full 64bit value and allows for generation of periods longer - * than hardware might allow. - */ - local64_t period_left; + /* + * However much is left of the current period; + * note that this is a full 64bit value and + * allows for generation of periods longer + * than hardware might allow. + */ + local64_t period_left; + }; + struct { /* Topdown events counting for context switch */ + u64 saved_metric; + u64 saved_slots; + }; + }; /* * State for throttling the event, see __perf_event_overflow() and @@ -576,9 +585,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. + * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and + * cannot be a group leader. If an event with this flag is detached from the + * group it is scheduled out and moved into an unrecoverable ERROR state. 
*/ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) +#define PERF_EV_CAP_SIBLING BIT(2) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) @@ -859,7 +872,6 @@ struct perf_cpu_context { struct list_head cgrp_cpuctx_entry; #endif - struct list_head sched_cb_entry; int sched_cb_usage; int online; diff --git a/kernel/events/core.c b/kernel/events/core.c index e8bf92202542..da467e1dd49a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -383,7 +383,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -2134,8 +2133,24 @@ static inline struct list_head *get_event_list(struct perf_event *event) return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; } +/* + * Events that have PERF_EV_CAP_SIBLING require being part of a group and + * cannot exist on their own, schedule them out and move them into the ERROR + * state. Also see _perf_event_enable(), it will not be able to recover + * this ERROR state. + */ +static inline void perf_remove_sibling_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + event_sched_out(event, cpuctx, ctx); + perf_event_set_state(event, PERF_EVENT_STATE_ERROR); +} + static void perf_group_detach(struct perf_event *event) { + struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; @@ -2154,7 +2169,7 @@ static void perf_group_detach(struct perf_event *event) /* * If this is a sibling, remove it from its group. */ - if (event->group_leader != event) { + if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; goto out; @@ -2167,6 +2182,9 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + if (sibling->event_caps & PERF_EV_CAP_SIBLING) + perf_remove_sibling_event(sibling); + sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2184,10 +2202,10 @@ static void perf_group_detach(struct perf_event *event) } out: - perf_event__header_size(event->group_leader); - - for_each_sibling_event(tmp, event->group_leader) + for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); + + perf_event__header_size(leader); } static bool is_orphaned_event(struct perf_event *event) @@ -2980,6 +2998,7 @@ static void _perf_event_enable(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { +out: raw_spin_unlock_irq(&ctx->lock); return; } @@ -2991,8 +3010,16 @@ static void _perf_event_enable(struct perf_event *event) * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ - if (event->state == PERF_EVENT_STATE_ERROR) + if (event->state == PERF_EVENT_STATE_ERROR) { + /* + * Detached SIBLING events cannot leave ERROR state. 
+ */ + if (event->event_caps & PERF_EV_CAP_SIBLING && + event->group_leader == event) + goto out; + event->state = PERF_EVENT_STATE_OFF; + } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); @@ -3357,10 +3384,12 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; + struct pmu *pmu; if (likely(!ctx)) return; + pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -3390,11 +3419,15 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - struct pmu *pmu = ctx->pmu; WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); + /* * PMU specific parts of task perf context can require * additional synchronization. As an example of such @@ -3406,6 +3439,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, else swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + perf_pmu_enable(pmu); + /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of @@ -3428,21 +3463,22 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); + perf_pmu_disable(pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(ctx, false); task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + + perf_pmu_enable(pmu); raw_spin_unlock(&ctx->lock); } } -static DEFINE_PER_CPU(struct list_head, sched_cb_list); - void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - this_cpu_dec(perf_sched_cb_usages); - - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + --cpuctx->sched_cb_usage; } @@ -3450,10 +3486,7 @@ void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); - - this_cpu_inc(perf_sched_cb_usages); + cpuctx->sched_cb_usage++; } /* @@ -3464,30 +3497,22 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. 
*/ -static void perf_pmu_sched_task(struct task_struct *prev, - struct task_struct *next, - bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) { - struct perf_cpu_context *cpuctx; struct pmu *pmu; - if (prev == next) - return; + pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ - - if (WARN_ON_ONCE(!pmu->sched_task)) - continue; + if (WARN_ON_ONCE(!pmu->sched_task)) + return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(pmu); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - pmu->sched_task(cpuctx->task_ctx, sched_in); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_pmu_enable(pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_event_switch(struct task_struct *task, @@ -3512,9 +3537,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(task, next, false); - if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); @@ -3746,10 +3768,14 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { struct perf_cpu_context *cpuctx; + struct pmu *pmu = ctx->pmu; cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) + if (cpuctx->task_ctx == ctx) { + if (cpuctx->sched_cb_usage) + __perf_pmu_sched_task(cpuctx, true); return; + } perf_ctx_lock(cpuctx, ctx); /* @@ -3759,7 +3785,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3771,7 +3797,11 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); - perf_pmu_enable(ctx->pmu); + + if (cpuctx->sched_cb_usage && pmu->sched_task) + pmu->sched_task(cpuctx->task_ctx, true); + + perf_pmu_enable(pmu); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -3814,9 +3844,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); - - if (__this_cpu_read(perf_sched_cb_usages)) - perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -5869,11 +5896,11 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); + bool detach_rest = false; if (event->pmu->event_unmapped) event->pmu->event_unmapped(event, vma->vm_mm); @@ -5904,7 +5931,8 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&event->mmap_mutex); } - atomic_dec(&rb->mmap_count); + if (atomic_dec_and_test(&rb->mmap_count)) + detach_rest = true; if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; @@ -5913,7 +5941,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) 
mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) + if (!detach_rest) goto out_put; /* @@ -12829,7 +12857,6 @@ static void __init perf_event_init_all_cpus(void) #ifdef CONFIG_CGROUP_PERF INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); #endif - INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aefb6065b508..19c00ee90945 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -106,9 +106,10 @@ static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, struct module *mod) { - int len = strlen(mod->name); + int len = strlen(module_name(mod)); const char *name = trace_kprobe_symbol(tk); - return strncmp(mod->name, name, len) == 0 && name[len] == ':'; + + return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; } static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) @@ -688,7 +689,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, if (ret) pr_warn("Failed to re-register probe %s on %s: %d\n", trace_probe_name(&tk->tp), - mod->name, ret); + module_name(mod), ret); } } mutex_unlock(&event_mutex);
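
Illustrative notes on a few of the hunks above (standalone sketches, not kernel code).

The new snr_uncore_pci_enable_event() programs a 64-bit event control through two 32-bit PCI config-space writes, since the Snow Ridge PCIe3 unit carries extended umask bits above bit 31 (it is the only SNR PCI box with an event_mask_ext). A minimal sketch of that split, with write_cfg32() standing in for the kernel's pci_write_config_dword() and the event value chosen arbitrarily:

#include <stdint.h>
#include <stdio.h>

#define PMON_CTL_EN (1ULL << 22)	/* mirrors SNBEP_PMON_CTL_EN */

/* stand-in for pci_write_config_dword(); just logs the access */
static void write_cfg32(unsigned off, uint32_t val)
{
	printf("cfg[0x%03x] = 0x%08x\n", off, (unsigned)val);
}

static void enable_event(unsigned config_base, uint64_t config)
{
	/* low dword at config_base carries the enable bit ... */
	write_cfg32(config_base,     (uint32_t)(config | PMON_CTL_EN));
	/* ... high dword at config_base + 4 carries the extended umask */
	write_cfg32(config_base + 4, (uint32_t)(config >> 32));
}

int main(void)
{
	/* event=0x40, umask_ext bit 0 set -- illustrative values only */
	enable_event(0x508 /* SNR_PCIE3_PCI_PMON_CTL0 */, 0x100000040ULL);
	return 0;
}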
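
The free-running IMC scale strings change from 3.814697266e-6 to 6.103515625e-5, i.e. from 4/2^20 to 64/2^20. perf multiplies the raw count by this scale to report the event in its unit (MiB), so the fix amounts to treating each counter increment as one 64-byte cache line rather than 4 bytes; that reading is inferred from the numbers, not quoted from the changelog:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const double mib = 1024.0 * 1024.0;
	const double old_scale = 4.0 / mib;	/* 3.814697265625e-06 */
	const double new_scale = 64.0 / mib;	/* 6.103515625e-05    */
	uint64_t delta = 1000000;		/* raw counter increments */

	printf("old scale %.9e -> %.2f MiB\n", old_scale, delta * old_scale);
	printf("new scale %.9e -> %.2f MiB\n", new_scale, delta * new_scale);
	return 0;
}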
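
The TOPDOWN.SLOTS and metric pseudo-encodings added to asm/perf_event.h, and the is_slots_event()/is_metric_event() tests built on them in events/perf_event.h, can be exercised in isolation. The mask and encoding constants below are copied from the hunks above; the main() harness is only an illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ARCH_PERFMON_EVENTSEL_EVENT  0x000000ffULL
#define ARCH_PERFMON_EVENTSEL_UMASK  0x0000ff00ULL
#define INTEL_ARCH_EVENT_MASK \
	(ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)

#define INTEL_TD_SLOTS            0x0400	/* TOPDOWN.SLOTS */
#define INTEL_TD_METRIC_RETIRING  0x8000	/* first L1 metric */
#define INTEL_TD_METRIC_MAX       0x8300	/* last L1 metric (BE bound) */

static bool is_metric_config(uint64_t config)
{
	/* event code must be 0x00, umask must fall in the metric range */
	return (config & ARCH_PERFMON_EVENTSEL_EVENT) == 0 &&
	       (config & INTEL_ARCH_EVENT_MASK) >= INTEL_TD_METRIC_RETIRING &&
	       (config & INTEL_ARCH_EVENT_MASK) <= INTEL_TD_METRIC_MAX;
}

static bool is_slots_config(uint64_t config)
{
	return (config & INTEL_ARCH_EVENT_MASK) == INTEL_TD_SLOTS;
}

int main(void)
{
	const uint64_t configs[] = { 0x0400, 0x8000, 0x8300, 0x00c0 /* inst_retired.any */ };

	for (unsigned i = 0; i < sizeof(configs) / sizeof(configs[0]); i++)
		printf("config=0x%04llx slots=%d metric=%d\n",
		       (unsigned long long)configs[i],
		       is_slots_config(configs[i]),
		       is_metric_config(configs[i]));
	return 0;
}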
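
PERF_EV_CAP_SIBLING ties an event's life to its group: perf_group_detach() moves such a sibling into ERROR, and once the event has become its own leader _perf_event_enable() refuses to bring it back. A toy state machine of just that rule, with none of the real locking, contexts or scheduling:

#include <stdio.h>

enum state { STATE_INACTIVE, STATE_ERROR };
#define EV_CAP_SIBLING (1u << 2)	/* mirrors PERF_EV_CAP_SIBLING */

struct event {
	const char *name;
	unsigned caps;
	enum state state;
	struct event *leader;		/* == self once detached */
};

static void group_detach(struct event *leader, struct event **siblings, int n)
{
	for (int i = 0; i < n; i++) {
		if (siblings[i]->caps & EV_CAP_SIBLING)
			siblings[i]->state = STATE_ERROR;	/* unrecoverable */
		siblings[i]->leader = siblings[i];		/* promote to singleton */
	}
	(void)leader;
}

static void event_enable(struct event *ev)
{
	/* a detached SIBLING event may not leave ERROR */
	if (ev->state == STATE_ERROR &&
	    (ev->caps & EV_CAP_SIBLING) && ev->leader == ev)
		return;
	ev->state = STATE_INACTIVE;
}

int main(void)
{
	struct event slots  = { "slots",  0,              STATE_INACTIVE, &slots };
	struct event metric = { "metric", EV_CAP_SIBLING, STATE_INACTIVE, &slots };
	struct event *sibs[] = { &metric };

	group_detach(&slots, sibs, 1);
	event_enable(&metric);
	printf("%s after detach + enable: %s\n", metric.name,
	       metric.state == STATE_ERROR ? "ERROR" : "INACTIVE");
	return 0;
}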
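
The perf_mmap_close() fix replaces "atomic_dec(); ... atomic_read() == 0" with a single atomic_dec_and_test() whose result is cached in detach_rest, so the "last unmapper" decision is made at the moment the reference is dropped. A simplified model of the two shapes using C11 atomics (function names are mine; the real race also involves a concurrent perf_mmap(), which is not modelled here):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int mmap_count = 2;

/* Buggy shape: decrement now, look at the counter again later.  Between the
 * two steps another mapper can change the count, so the later test no longer
 * says anything about who dropped the last reference. */
static bool drop_ref_then_check(void)
{
	atomic_fetch_sub(&mmap_count, 1);
	/* ...window for a concurrent mapper... */
	return atomic_load(&mmap_count) == 0;
}

/* Fixed shape: one atomic operation answers "did I drop the last reference?"
 * and the caller caches the answer (detach_rest in the patch). */
static bool drop_ref_and_test(void)
{
	return atomic_fetch_sub(&mmap_count, 1) == 1;
}

int main(void)
{
	printf("first unmap:  last=%d\n", drop_ref_then_check());	/* 2 -> 1 */
	printf("second unmap: last=%d\n", drop_ref_and_test());	/* 1 -> 0 */
	return 0;
}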
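
The trace_kprobe hunks switch from mod->name to the module_name() accessor, but the underlying test is unchanged: a probe symbol written as "module:symbol" belongs to a module iff the module name is a prefix of the symbol string and is followed immediately by ':'. A userspace rendering of that check, with module_name() as a trivial stand-in for the kernel accessor:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct module { const char *name; };

static const char *module_name(const struct module *mod)
{
	return mod->name;
}

static bool within_module(const char *symbol, const struct module *mod)
{
	size_t len = strlen(module_name(mod));

	return strncmp(module_name(mod), symbol, len) == 0 && symbol[len] == ':';
}

int main(void)
{
	struct module mod = { .name = "ext4" };

	printf("%d %d %d\n",
	       within_module("ext4:ext4_sync_file", &mod),	/* 1 */
	       within_module("ext4x:foo", &mod),		/* 0 */
	       within_module("xfs:xfs_fsync", &mod));		/* 0 */
	return 0;
}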