From 0158b83f7508851cda9e62c4b14678c5bd492cb2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Mar 2016 15:23:46 +0100 Subject: perf/x86/ibs: Fix IBS throttle When the IBS IRQ handler get a !0 return from perf_event_overflow; meaning it should throttle the event, it only disables it, it doesn't call perf_ibs_stop(). This confuses the state machine, as we'll use pmu::start() -> perf_ibs_start() to unthrottle. Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vince Weaver Cc: dvyukov@google.com Cc: oleg@redhat.com Cc: panand@redhat.com Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/20160311142346.GE6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 51087c29b2c2..7956d29762ef 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -599,7 +599,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) throttle = perf_event_overflow(event, &data, ®s); out: if (throttle) - perf_ibs_disable_event(perf_ibs, hwc, *config); + perf_ibs_stop(event, 0); else perf_ibs_enable_event(perf_ibs, hwc, period >> 4); -- cgit v1.2.3 From 5a50f52917011365a4a2c0f1d2cea0990c4465cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Mar 2016 23:55:21 +0100 Subject: perf/x86/ibs: Fix race with IBS_STARTING state While tracing the IBS bits I saw the NMI hitting between clearing IBS_STARTING and the actual MSR writes to disable the counter. Since IBS_STARTING was cleared, the handler assumed these were spurious NMIs and because STOPPING wasn't set yet either, insta-triggered an "Unknown NMI". Cure this by clearing IBS_STARTING after disabling the hardware. Tested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 7956d29762ef..c43a36d5fd6f 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -376,7 +376,13 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + /* + * Set STARTED before enabling the hardware, such that + * a subsequent NMI must observe it. Then clear STOPPING + * such that we don't consume NMIs by accident. + */ set_bit(IBS_STARTED, pcpu->state); + clear_bit(IBS_STOPPING, pcpu->state); perf_ibs_enable_event(perf_ibs, hwc, period >> 4); perf_event_update_userpage(event); @@ -390,7 +396,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) u64 config; int stopping; - stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + stopping = test_bit(IBS_STARTED, pcpu->state); if (!stopping && (hwc->state & PERF_HES_UPTODATE)) return; @@ -398,8 +404,24 @@ static void perf_ibs_stop(struct perf_event *event, int flags) rdmsrl(hwc->config_base, config); if (stopping) { + /* + * Set STOPPING before disabling the hardware, such that it + * must be visible to NMIs the moment we clear the EN bit, + * at which point we can generate an !VALID sample which + * we need to consume. + */ set_bit(IBS_STOPPING, pcpu->state); perf_ibs_disable_event(perf_ibs, hwc, config); + /* + * Clear STARTED after disabling the hardware; if it were + * cleared before an NMI hitting after the clear but before + * clearing the EN bit might think it a spurious NMI and not + * handle it. + * + * Clearing it after, however, creates the problem of the NMI + * handler seeing STARTED but not having a valid sample. + */ + clear_bit(IBS_STARTED, pcpu->state); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -527,20 +549,24 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) u64 *buf, *config, period; if (!test_bit(IBS_STARTED, pcpu->state)) { +fail: /* * Catch spurious interrupts after stopping IBS: After * disabling IBS there could be still incoming NMIs * with samples that even have the valid bit cleared. * Mark all this NMIs as handled. */ - return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; + if (test_and_clear_bit(IBS_STOPPING, pcpu->state)) + return 1; + + return 0; } msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf); if (!(*buf++ & perf_ibs->valid_mask)) - return 0; + goto fail; config = &ibs_data.regs[0]; perf_ibs_event_update(perf_ibs, event, config); -- cgit v1.2.3 From c2872d381f1ab1bc39c7f044644bf1d3da7c11dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Mar 2016 15:06:58 +0100 Subject: perf/x86/ibs: Add IBS interrupt to the dynamic throttle Interrupt throttling is normally only done against sysctl_perf_event_sample_rate. This means that if that number is too high (for whatever reason) you can lock up your machine. We have, however, a dynamic throttling scheme too, but for that to work, we need to add a callback to the interrupt handler, IBS did not have this, so add it. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/amd/ibs.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c43a36d5fd6f..3ea25c3917c0 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -637,6 +637,7 @@ out: static int perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) { + u64 stamp = sched_clock(); int handled = 0; handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); @@ -645,6 +646,8 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) if (handled) inc_irq_stat(apic_perf_irqs); + perf_sample_event_took(sched_clock() - stamp); + return handled; } NOKPROBE_SYMBOL(perf_ibs_nmi_handler); -- cgit v1.2.3 From e8d8a90fc5014e4cdcf1de2f320081d26c7f440f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Mar 2016 17:31:27 +0100 Subject: perf/x86/BTS: Fix RCU usage This splat reminds us: [ 8166.045595] [ INFO: suspicious RCU usage. ] [ 8166.168972] [] lockdep_rcu_suspicious+0xe7/0x120 [ 8166.175966] [] perf_callchain+0x23e/0x250 [ 8166.182280] [] perf_prepare_sample+0x27d/0x350 [ 8166.189082] [] intel_pmu_drain_bts_buffer+0x133/0x200 ... that as the core code does, one should hold rcu_read_lock() over that entire BTS event-output generation sequence as well. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/ds.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ce7211a07c0b..8584b90d8e0b 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -570,11 +570,12 @@ int intel_pmu_drain_bts_buffer(void) * We will overwrite the from and to address before we output * the sample. */ + rcu_read_lock(); perf_prepare_sample(&header, &data, event, ®s); if (perf_output_begin(&handle, event, header.size * (top - base - skip))) - return 1; + goto unlock; for (at = base; at < top; at++) { /* Filter out any records that contain kernel addresses. */ @@ -593,6 +594,8 @@ int intel_pmu_drain_bts_buffer(void) /* There's new data available. */ event->hw.interrupts++; event->pending_kill = POLL_IN; +unlock: + rcu_read_unlock(); return 1; } -- cgit v1.2.3 From a223c1c7ab4cc64537dc4b911f760d851683768a Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:07 -0800 Subject: perf/x86/cqm: Fix CQM handling of grouping events into a cache_group Currently CQM (cache quality of service monitoring) is grouping all events belonging to same PID to use one RMID. However its not counting all of these different events. Hence we end up with a count of zero for all events other than the group leader. The patch tries to address the issue by keeping a flag in the perf_event.hw which has other CQM related fields. The field is updated at event creation and during grouping. Signed-off-by: Vikas Shivappa [peterz: Changed hw_perf_event::is_group_event to an int] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-2-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 13 ++++++++++--- include/linux/perf_event.h | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 93cb412a5579..b0226f1912a2 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -281,9 +281,13 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. + * Mark it as a multi event group, so that we update ->count + * for every event rather than just the group leader later. */ - if (a->hw.target == b->hw.target) + if (a->hw.target == b->hw.target) { + b->hw.is_group_event = true; return true; + } /* * Are we an inherited event? @@ -849,6 +853,7 @@ static void intel_cqm_setup_event(struct perf_event *event, bool conflict = false; u32 rmid; + event->hw.is_group_event = false; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { rmid = iter->hw.cqm_rmid; @@ -940,7 +945,9 @@ static u64 intel_cqm_event_count(struct perf_event *event) return __perf_event_count(event); /* - * Only the group leader gets to report values. This stops us + * Only the group leader gets to report values except in case of + * multiple events in the same group, we still need to read the + * other events.This stops us * reporting duplicate values to userspace, and gives us a clear * rule for which task gets to report the values. * @@ -948,7 +955,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) * specific packages - we forfeit that ability when we create * task events. */ - if (!cqm_group_leader(event)) + if (!cqm_group_leader(event) && !event->hw.is_group_event) return 0; /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 79ec7bbf0155..7bb315bec3aa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -121,6 +121,7 @@ struct hw_perf_event { struct { /* intel_cqm */ int cqm_state; u32 cqm_rmid; + int is_group_event; struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; -- cgit v1.2.3 From ada2f634cd50d050269b67b4e2966582387e7c27 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:08 -0800 Subject: perf/x86/cqm: Fix CQM memory leak and notifier leak Fixes the hotcpu notifier leak and other global variable memory leaks during CQM (cache quality of service monitoring) initialization. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-3-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index b0226f1912a2..dbb058d2910a 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -211,6 +211,20 @@ static void __put_rmid(u32 rmid) list_add_tail(&entry->list, &cqm_rmid_limbo_lru); } +static void cqm_cleanup(void) +{ + int i; + + if (!cqm_rmid_ptrs) + return; + + for (i = 0; i < cqm_max_rmid; i++) + kfree(cqm_rmid_ptrs[i]); + + kfree(cqm_rmid_ptrs); + cqm_rmid_ptrs = NULL; +} + static int intel_cqm_setup_rmid_cache(void) { struct cqm_rmid_entry *entry; @@ -218,7 +232,7 @@ static int intel_cqm_setup_rmid_cache(void) int r = 0; nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * nr_rmids, GFP_KERNEL); if (!cqm_rmid_ptrs) return -ENOMEM; @@ -249,11 +263,9 @@ static int intel_cqm_setup_rmid_cache(void) mutex_unlock(&cache_mutex); return 0; -fail: - while (r--) - kfree(cqm_rmid_ptrs[r]); - kfree(cqm_rmid_ptrs); +fail: + cqm_cleanup(); return -ENOMEM; } @@ -1312,7 +1324,7 @@ static const struct x86_cpu_id intel_cqm_match[] = { static int __init intel_cqm_init(void) { - char *str, scale[20]; + char *str = NULL, scale[20]; int i, cpu, ret; if (!x86_match_cpu(intel_cqm_match)) @@ -1372,16 +1384,25 @@ static int __init intel_cqm_init(void) cqm_pick_event_reader(i); } - __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) + if (ret) { pr_err("Intel CQM perf registration failed: %d\n", ret); - else - pr_info("Intel CQM monitoring enabled\n"); + goto out; + } + pr_info("Intel CQM monitoring enabled\n"); + + /* + * Register the hot cpu notifier once we are sure cqm + * is enabled to avoid notifier leak. + */ + __perf_cpu_notifier(intel_cqm_cpu_notifier); out: cpu_notifier_register_done(); + if (ret) { + kfree(str); + cqm_cleanup(); + } return ret; } -- cgit v1.2.3 From 33c3cc7acfd95968d74247f1a4e1b0727a07ed43 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:09 -0800 Subject: perf/x86/mbm: Add Intel Memory B/W Monitoring enumeration and init The MBM init patch enumerates the Intel MBM (Memory b/w monitoring) and initializes the perf events and datastructures for monitoring the memory b/w. Its based on original patch series by Tony Luck and Kanaka Juvva. Memory bandwidth monitoring (MBM) provides OS/VMM a way to monitor bandwidth from one level of cache to another. The current patches support L3 external bandwidth monitoring. It supports both 'local bandwidth' and 'total bandwidth' monitoring for the socket. Local bandwidth measures the amount of data sent through the memory controller on the socket and total b/w measures the total system bandwidth. Extending the cache quality of service monitoring (CQM) we add two more events to the perf infrastructure: intel_cqm_llc/local_bytes - bytes sent through local socket memory controller intel_cqm_llc/total_bytes - total L3 external bytes sent The tasks are associated with a Resouce Monitoring ID (RMID) just like in CQM and OS uses a MSR write to indicate the RMID of the task during scheduling. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-4-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 126 ++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/kernel/cpu/common.c | 4 +- 3 files changed, 128 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index dbb058d2910a..515df11e65bb 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -15,6 +15,7 @@ static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ +static bool cqm_enabled, mbm_enabled; /** * struct intel_pqr_state - State cache for the PQR MSR @@ -42,6 +43,24 @@ struct intel_pqr_state { * interrupts disabled, which is sufficient for the protection. */ static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +/** + * struct sample - mbm event's (local or total) data + * @total_bytes #bytes since we began monitoring + * @prev_msr previous value of MSR + */ +struct sample { + u64 total_bytes; + u64 prev_msr; +}; + +/* + * samples profiled for total memory bandwidth type events + */ +static struct sample *mbm_total; +/* + * samples profiled for local memory bandwidth type events + */ +static struct sample *mbm_local; /* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. @@ -223,6 +242,7 @@ static void cqm_cleanup(void) kfree(cqm_rmid_ptrs); cqm_rmid_ptrs = NULL; + cqm_enabled = false; } static int intel_cqm_setup_rmid_cache(void) @@ -1164,6 +1184,16 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); +EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); +EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); +EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); +EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); + +EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); +EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); +EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); +EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); + static struct attribute *intel_cqm_events_attr[] = { EVENT_PTR(intel_cqm_llc), EVENT_PTR(intel_cqm_llc_pkg), @@ -1173,9 +1203,38 @@ static struct attribute *intel_cqm_events_attr[] = { NULL, }; +static struct attribute *intel_mbm_events_attr[] = { + EVENT_PTR(intel_cqm_total_bytes), + EVENT_PTR(intel_cqm_local_bytes), + EVENT_PTR(intel_cqm_total_bytes_pkg), + EVENT_PTR(intel_cqm_local_bytes_pkg), + EVENT_PTR(intel_cqm_total_bytes_unit), + EVENT_PTR(intel_cqm_local_bytes_unit), + EVENT_PTR(intel_cqm_total_bytes_scale), + EVENT_PTR(intel_cqm_local_bytes_scale), + NULL, +}; + +static struct attribute *intel_cmt_mbm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_total_bytes), + EVENT_PTR(intel_cqm_local_bytes), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_total_bytes_pkg), + EVENT_PTR(intel_cqm_local_bytes_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_total_bytes_unit), + EVENT_PTR(intel_cqm_local_bytes_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_total_bytes_scale), + EVENT_PTR(intel_cqm_local_bytes_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + static struct attribute_group intel_cqm_events_group = { .name = "events", - .attrs = intel_cqm_events_attr, + .attrs = NULL, }; PMU_FORMAT_ATTR(event, "config:0-7"); @@ -1322,12 +1381,57 @@ static const struct x86_cpu_id intel_cqm_match[] = { {} }; +static void mbm_cleanup(void) +{ + if (!mbm_enabled) + return; + + kfree(mbm_local); + kfree(mbm_total); + mbm_enabled = false; +} + +static const struct x86_cpu_id intel_mbm_local_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, + {} +}; + +static const struct x86_cpu_id intel_mbm_total_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, + {} +}; + +static int intel_mbm_init(void) +{ + int array_size, maxid = cqm_max_rmid + 1; + + array_size = sizeof(struct sample) * maxid * topology_max_packages(); + mbm_local = kmalloc(array_size, GFP_KERNEL); + if (!mbm_local) + return -ENOMEM; + + mbm_total = kmalloc(array_size, GFP_KERNEL); + if (!mbm_total) { + mbm_cleanup(); + return -ENOMEM; + } + + return 0; +} + static int __init intel_cqm_init(void) { char *str = NULL, scale[20]; int i, cpu, ret; - if (!x86_match_cpu(intel_cqm_match)) + if (x86_match_cpu(intel_cqm_match)) + cqm_enabled = true; + + if (x86_match_cpu(intel_mbm_local_match) && + x86_match_cpu(intel_mbm_total_match)) + mbm_enabled = true; + + if (!cqm_enabled && !mbm_enabled) return -ENODEV; cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; @@ -1384,13 +1488,28 @@ static int __init intel_cqm_init(void) cqm_pick_event_reader(i); } + if (mbm_enabled) + ret = intel_mbm_init(); + if (ret && !cqm_enabled) + goto out; + + if (cqm_enabled && mbm_enabled) + intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; + else if (!cqm_enabled && mbm_enabled) + intel_cqm_events_group.attrs = intel_mbm_events_attr; + else if (cqm_enabled && !mbm_enabled) + intel_cqm_events_group.attrs = intel_cqm_events_attr; + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); if (ret) { pr_err("Intel CQM perf registration failed: %d\n", ret); goto out; } - pr_info("Intel CQM monitoring enabled\n"); + if (cqm_enabled) + pr_info("Intel CQM monitoring enabled\n"); + if (mbm_enabled) + pr_info("Intel MBM enabled\n"); /* * Register the hot cpu notifier once we are sure cqm @@ -1402,6 +1521,7 @@ out: if (ret) { kfree(str); cqm_cleanup(); + mbm_cleanup(); } return ret; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 074b7604bd51..746dd6ae4932 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -245,6 +245,8 @@ /* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ #define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62590aa064c8..e601c1286e29 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -649,7 +649,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_F_1_EDX] = edx; - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) { + if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || + ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || + (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; } -- cgit v1.2.3 From 87f01cc2a2914b61ade5ec834377fa7819484173 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 11 Mar 2016 11:26:11 -0800 Subject: perf/x86/mbm: Add memory bandwidth monitoring event management Includes all the core infrastructure to measure the total_bytes and bandwidth. We have per socket counters for both total system wide L3 external bytes and local socket memory-controller bytes. The OS does MSR writes to MSR_IA32_QM_EVTSEL and MSR_IA32_QM_CTR to read the counters and uses the IA32_PQR_ASSOC_MSR to associate the RMID with the task. The tasks have a common RMID for CQM (cache quality of service monitoring) and MBM. Hence most of the scheduling code is reused from CQM. Signed-off-by: Tony Luck [ Restructured rmid_read to not have an obvious hole, removed MBM_CNTR_MAX as its unused. ] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vikas Shivappa Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/abd7aac9a18d93b95b985b931cf258df0164746d.1457723885.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 122 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 515df11e65bb..610bd8ab37e4 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -13,6 +13,8 @@ #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d +#define MBM_CNTR_WIDTH 24 + static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ static bool cqm_enabled, mbm_enabled; @@ -62,6 +64,16 @@ static struct sample *mbm_total; */ static struct sample *mbm_local; +#define pkg_id topology_physical_package_id(smp_processor_id()) +/* + * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. + * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of + * rmids per socket, an example is given below + * RMID1 of Socket0: vrmid = 1 + * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 + * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 + */ +#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) /* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. * Also protects event->hw.cqm_rmid @@ -84,9 +96,13 @@ static cpumask_t cqm_cpumask; #define RMID_VAL_ERROR (1ULL << 63) #define RMID_VAL_UNAVAIL (1ULL << 62) -#define QOS_L3_OCCUP_EVENT_ID (1 << 0) - -#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_MBM_LOCAL_EVENT_ID 0x03 /* * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). @@ -428,10 +444,17 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b) struct rmid_read { u32 rmid; + u32 evt_type; atomic64_t value; }; static void __intel_cqm_event_count(void *info); +static void init_mbm_sample(u32 rmid, u32 evt_type); + +static bool is_mbm_event(int e) +{ + return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); +} /* * Exchange the RMID of a group of events. @@ -873,6 +896,68 @@ static void intel_cqm_rmid_rotate(struct work_struct *work) schedule_delayed_work(&intel_cqm_rmid_work, delay); } +static u64 update_sample(unsigned int rmid, u32 evt_type, int first) +{ + struct sample *mbm_current; + u32 vrmid = rmid_2_index(rmid); + u64 val, bytes, shift; + u32 eventid; + + if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { + mbm_current = &mbm_local[vrmid]; + eventid = QOS_MBM_LOCAL_EVENT_ID; + } else { + mbm_current = &mbm_total[vrmid]; + eventid = QOS_MBM_TOTAL_EVENT_ID; + } + + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return mbm_current->total_bytes; + + if (first) { + mbm_current->prev_msr = val; + mbm_current->total_bytes = 0; + return mbm_current->total_bytes; + } + + shift = 64 - MBM_CNTR_WIDTH; + bytes = (val << shift) - (mbm_current->prev_msr << shift); + bytes >>= shift; + + bytes *= cqm_l3_scale; + + mbm_current->total_bytes += bytes; + mbm_current->prev_msr = val; + + return mbm_current->total_bytes; +} + +static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) +{ + return update_sample(rmid, evt_type, 0); +} + +static void __intel_mbm_event_init(void *info) +{ + struct rmid_read *rr = info; + + update_sample(rr->rmid, rr->evt_type, 1); +} + +static void init_mbm_sample(u32 rmid, u32 evt_type) +{ + struct rmid_read rr = { + .rmid = rmid, + .evt_type = evt_type, + .value = ATOMIC64_INIT(0), + }; + + /* on each socket, init sample */ + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); +} + /* * Find a group and setup RMID. * @@ -893,6 +978,8 @@ static void intel_cqm_setup_event(struct perf_event *event, /* All tasks in a group share an RMID */ event->hw.cqm_rmid = rmid; *group = iter; + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); return; } @@ -909,6 +996,9 @@ static void intel_cqm_setup_event(struct perf_event *event, else rmid = __get_rmid(); + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + event->hw.cqm_rmid = rmid; } @@ -930,7 +1020,10 @@ static void intel_cqm_event_read(struct perf_event *event) if (!__rmid_valid(rmid)) goto out; - val = __rmid_read(rmid); + if (is_mbm_event(event->attr.config)) + val = rmid_read_mbm(rmid, event->attr.config); + else + val = __rmid_read(rmid); /* * Ignore this reading on error states and do not update the value. @@ -961,6 +1054,17 @@ static inline bool cqm_group_leader(struct perf_event *event) return !list_empty(&event->hw.cqm_groups_entry); } +static void __intel_mbm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = rmid_read_mbm(rr->rmid, rr->evt_type); + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + atomic64_add(val, &rr->value); +} + static u64 intel_cqm_event_count(struct perf_event *event) { unsigned long flags; @@ -1014,7 +1118,12 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!__rmid_valid(rr.rmid)) goto out; - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + if (is_mbm_event(event->attr.config)) { + rr.evt_type = event->attr.config; + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1); + } else { + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + } raw_spin_lock_irqsave(&cache_lock, flags); if (event->hw.cqm_rmid == rr.rmid) @@ -1129,7 +1238,8 @@ static int intel_cqm_event_init(struct perf_event *event) if (event->attr.type != intel_cqm_pmu.type) return -ENOENT; - if (event->attr.config & ~QOS_EVENT_MASK) + if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || + (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) return -EINVAL; /* unsupported modes and filters */ -- cgit v1.2.3 From 2d4de8376ff1d94a5070cfa9092c59bfdc4e693e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Thu, 10 Mar 2016 15:32:11 -0800 Subject: perf/x86/mbm: Implement RMID recycling RMID could be allocated or deallocated as part of RMID recycling. When an RMID is allocated for MBM event, the MBM counter needs to be initialized because next time we read the counter we need the previous value to account for total bytes that went to the memory controller. Similarly, when RMID is deallocated we need to update the ->count variable. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/1457652732-4499-6-git-send-email-vikas.shivappa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 610bd8ab37e4..a98f472bf6b2 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -450,6 +450,7 @@ struct rmid_read { static void __intel_cqm_event_count(void *info); static void init_mbm_sample(u32 rmid, u32 evt_type); +static void __intel_mbm_event_count(void *info); static bool is_mbm_event(int e) { @@ -476,8 +477,14 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) .rmid = old_rmid, }; - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); + if (is_mbm_event(group->attr.config)) { + rr.evt_type = group->attr.config; + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, + &rr, 1); + } else { + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, + &rr, 1); + } local64_set(&group->count, atomic64_read(&rr.value)); } @@ -489,6 +496,22 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) raw_spin_unlock_irq(&cache_lock); + /* + * If the allocation is for mbm, init the mbm stats. + * Need to check if each event in the group is mbm event + * because there could be multiple type of events in the same group. + */ + if (__rmid_valid(rmid)) { + event = group; + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + + list_for_each_entry(event, head, hw.cqm_group_entry) { + if (is_mbm_event(event->attr.config)) + init_mbm_sample(rmid, event->attr.config); + } + } + return old_rmid; } @@ -978,7 +1001,7 @@ static void intel_cqm_setup_event(struct perf_event *event, /* All tasks in a group share an RMID */ event->hw.cqm_rmid = rmid; *group = iter; - if (is_mbm_event(event->attr.config)) + if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) init_mbm_sample(rmid, event->attr.config); return; } @@ -996,7 +1019,7 @@ static void intel_cqm_setup_event(struct perf_event *event, else rmid = __get_rmid(); - if (is_mbm_event(event->attr.config)) + if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) init_mbm_sample(rmid, event->attr.config); event->hw.cqm_rmid = rmid; -- cgit v1.2.3 From e7ee3e8cb550ce43752ae1d1b190d6b5c4150a43 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 11 Mar 2016 11:26:17 -0800 Subject: perf/x86/mbm: Add support for MBM counter overflow handling This patch adds a per package timer which periodically updates the memory bandwidth counters for the events that are currently active. Current patch has a periodic timer every 1s since the SDM guarantees that the counter will not overflow in 1s but this time can be definitely improved by calibrating on the system. The overflow is really a function of the max memory b/w that the socket can support, max counter value and scaling factor. Signed-off-by: Vikas Shivappa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Tony Luck Acked-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Matt Fleming Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: fenghua.yu@intel.com Cc: h.peter.anvin@intel.com Cc: ravi.v.shankar@intel.com Cc: vikas.shivappa@intel.com Link: http://lkml.kernel.org/r/013b756c5006b1c4ca411f3ecf43ed52f19fbf87.1457723885.git.tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 139 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index a98f472bf6b2..380d62da8108 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -14,10 +14,15 @@ #define MSR_IA32_QM_EVTSEL 0x0c8d #define MBM_CNTR_WIDTH 24 +/* + * Guaranteed time in ms as per SDM where MBM counters will not overflow. + */ +#define MBM_CTR_OVERFLOW_TIME 1000 static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ static bool cqm_enabled, mbm_enabled; +unsigned int mbm_socket_max; /** * struct intel_pqr_state - State cache for the PQR MSR @@ -45,6 +50,7 @@ struct intel_pqr_state { * interrupts disabled, which is sufficient for the protection. */ static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +static struct hrtimer *mbm_timers; /** * struct sample - mbm event's (local or total) data * @total_bytes #bytes since we began monitoring @@ -945,6 +951,10 @@ static u64 update_sample(unsigned int rmid, u32 evt_type, int first) return mbm_current->total_bytes; } + /* + * The h/w guarantees that counters will not overflow + * so long as we poll them at least once per second. + */ shift = 64 - MBM_CNTR_WIDTH; bytes = (val << shift) - (mbm_current->prev_msr << shift); bytes >>= shift; @@ -1088,6 +1098,84 @@ static void __intel_mbm_event_count(void *info) atomic64_add(val, &rr->value); } +static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct perf_event *iter, *iter1; + int ret = HRTIMER_RESTART; + struct list_head *head; + unsigned long flags; + u32 grp_rmid; + + /* + * Need to cache_lock as the timer Event Select MSR reads + * can race with the mbm/cqm count() and mbm_init() reads. + */ + raw_spin_lock_irqsave(&cache_lock, flags); + + if (list_empty(&cache_groups)) { + ret = HRTIMER_NORESTART; + goto out; + } + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + grp_rmid = iter->hw.cqm_rmid; + if (!__rmid_valid(grp_rmid)) + continue; + if (is_mbm_event(iter->attr.config)) + update_sample(grp_rmid, iter->attr.config, 0); + + head = &iter->hw.cqm_group_entry; + if (list_empty(head)) + continue; + list_for_each_entry(iter1, head, hw.cqm_group_entry) { + if (!iter1->hw.is_group_event) + break; + if (is_mbm_event(iter1->attr.config)) + update_sample(iter1->hw.cqm_rmid, + iter1->attr.config, 0); + } + } + + hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); +out: + raw_spin_unlock_irqrestore(&cache_lock, flags); + + return ret; +} + +static void __mbm_start_timer(void *info) +{ + hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), + HRTIMER_MODE_REL_PINNED); +} + +static void __mbm_stop_timer(void *info) +{ + hrtimer_cancel(&mbm_timers[pkg_id]); +} + +static void mbm_start_timers(void) +{ + on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); +} + +static void mbm_stop_timers(void) +{ + on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); +} + +static void mbm_hrtimer_init(void) +{ + struct hrtimer *hr; + int i; + + for (i = 0; i < mbm_socket_max; i++) { + hr = &mbm_timers[i]; + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = mbm_hrtimer_handle; + } +} + static u64 intel_cqm_event_count(struct perf_event *event) { unsigned long flags; @@ -1217,8 +1305,14 @@ static int intel_cqm_event_add(struct perf_event *event, int mode) static void intel_cqm_event_destroy(struct perf_event *event) { struct perf_event *group_other = NULL; + unsigned long flags; mutex_lock(&cache_mutex); + /* + * Hold the cache_lock as mbm timer handlers could be + * scanning the list of events. + */ + raw_spin_lock_irqsave(&cache_lock, flags); /* * If there's another event in this group... @@ -1250,6 +1344,14 @@ static void intel_cqm_event_destroy(struct perf_event *event) } } + raw_spin_unlock_irqrestore(&cache_lock, flags); + + /* + * Stop the mbm overflow timers when the last event is destroyed. + */ + if (mbm_enabled && list_empty(&cache_groups)) + mbm_stop_timers(); + mutex_unlock(&cache_mutex); } @@ -1257,6 +1359,7 @@ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; bool rotate = false; + unsigned long flags; if (event->attr.type != intel_cqm_pmu.type) return -ENOENT; @@ -1282,9 +1385,21 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); + /* + * Start the mbm overflow timers when the first event is created. + */ + if (mbm_enabled && list_empty(&cache_groups)) + mbm_start_timers(); + /* Will also set rmid */ intel_cqm_setup_event(event, &group); + /* + * Hold the cache_lock as mbm timer handlers be + * scanning the list of events. + */ + raw_spin_lock_irqsave(&cache_lock, flags); + if (group) { list_add_tail(&event->hw.cqm_group_entry, &group->hw.cqm_group_entry); @@ -1303,6 +1418,7 @@ static int intel_cqm_event_init(struct perf_event *event) rotate = true; } + raw_spin_unlock_irqrestore(&cache_lock, flags); mutex_unlock(&cache_mutex); if (rotate) @@ -1536,20 +1652,33 @@ static const struct x86_cpu_id intel_mbm_total_match[] = { static int intel_mbm_init(void) { - int array_size, maxid = cqm_max_rmid + 1; + int ret = 0, array_size, maxid = cqm_max_rmid + 1; - array_size = sizeof(struct sample) * maxid * topology_max_packages(); + mbm_socket_max = topology_max_packages(); + array_size = sizeof(struct sample) * maxid * mbm_socket_max; mbm_local = kmalloc(array_size, GFP_KERNEL); if (!mbm_local) return -ENOMEM; mbm_total = kmalloc(array_size, GFP_KERNEL); if (!mbm_total) { - mbm_cleanup(); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - return 0; + array_size = sizeof(struct hrtimer) * mbm_socket_max; + mbm_timers = kmalloc(array_size, GFP_KERNEL); + if (!mbm_timers) { + ret = -ENOMEM; + goto out; + } + mbm_hrtimer_init(); + +out: + if (ret) + mbm_cleanup(); + + return ret; } static int __init intel_cqm_init(void) -- cgit v1.2.3 From 27348f382b6786fd201779246ee70fa115a5b890 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Mar 2016 23:39:39 +0100 Subject: perf/x86/cqm: Factor out some common code Having the same code twice (and once quite ugly) is fragile. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/x86/events/intel/cqm.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 380d62da8108..7b5fd811ef45 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -463,6 +463,14 @@ static bool is_mbm_event(int e) return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); } +static void cqm_mask_call(struct rmid_read *rr) +{ + if (is_mbm_event(rr->evt_type)) + on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); + else + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); +} + /* * Exchange the RMID of a group of events. */ @@ -479,18 +487,12 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) */ if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { struct rmid_read rr = { - .value = ATOMIC64_INIT(0), .rmid = old_rmid, + .evt_type = group->attr.config, + .value = ATOMIC64_INIT(0), }; - if (is_mbm_event(group->attr.config)) { - rr.evt_type = group->attr.config; - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, - &rr, 1); - } else { - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); - } + cqm_mask_call(&rr); local64_set(&group->count, atomic64_read(&rr.value)); } @@ -1180,6 +1182,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) { unsigned long flags; struct rmid_read rr = { + .evt_type = event->attr.config, .value = ATOMIC64_INIT(0), }; @@ -1229,12 +1232,7 @@ static u64 intel_cqm_event_count(struct perf_event *event) if (!__rmid_valid(rr.rmid)) goto out; - if (is_mbm_event(event->attr.config)) { - rr.evt_type = event->attr.config; - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1); - } else { - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); - } + cqm_mask_call(&rr); raw_spin_lock_irqsave(&cache_lock, flags); if (event->hw.cqm_rmid == rr.rmid) -- cgit v1.2.3 From 8dfeae0d73bf803be1a533e147b3b0ea69375596 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 14 Jan 2016 10:50:04 +0800 Subject: perf/x86/amd: Move nodes_per_socket into bsp_init_amd() nodes_per_socket is static and it needn't be initialized many times during every CPU core init. So move its initialization into bsp_init_amd(). Signed-off-by: Huang Rui Signed-off-by: Borislav Petkov Cc: Aaron Lu Cc: Alexander Shishkin Cc: Andreas Herrmann Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: Fengguang Wu Cc: Frédéric Weisbecker Cc: Guenter Roeck Cc: H. Peter Anvin Cc: Hector Marco-Gisbert Cc: Jacob Shin Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Cc: spg_linux_kernel@amd.com Link: http://lkml.kernel.org/r/1452739808-11871-2-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b39338c4b260..d4b06e8cd77c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -306,7 +306,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - nodes_per_socket = ((ecx >> 8) & 7) + 1; node_id = ecx & 7; /* get compute unit information */ @@ -317,7 +316,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; node_id = value & 7; } else return; @@ -519,6 +517,18 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); + + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + u32 ecx; + + ecx = cpuid_ecx(0x8000001e); + nodes_per_socket = ((ecx >> 8) & 7) + 1; + } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes_per_socket = ((value >> 3) & 7) + 1; + } } static void early_init_amd(struct cpuinfo_x86 *c) -- cgit v1.2.3 From f8519155b4d5224e215203bf0e94a8478a8f8945 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Sun, 28 Feb 2016 22:23:29 -0600 Subject: perf/x86/amd: Add support for new IOMMU performance events This patch adds new IOMMU performance event based on the information in table 74 of the AMD I/O Virtualization Technology (IOMMU) Specification (Document Id: 4882, Rev 2.62, Feb 2015) Signed-off-by: Suravee Suthikulpanit Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Joerg Roedel Acked-by: Joerg Roedel Cc: Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://support.amd.com/TechDocs/48882_IOMMU.pdf Signed-off-by: Ingo Molnar --- arch/x86/events/amd/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 635e5eba0caf..40625ca7a190 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -118,6 +118,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"), + AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"), + AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"), + AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"), + AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"), { /* end: all zeroes */ }, }; -- cgit v1.2.3 From 01fe03ff1c7ce6b6e2212cb6171a49c2858fbc7c Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 14 Jan 2016 10:50:06 +0800 Subject: x86/cpufeature, perf/x86: Add AMD Accumulated Power Mechanism feature flag AMD CPU family 15h model 0x60 introduces a mechanism for measuring accumulated power. It is used to report the processor power consumption and support for it is indicated by CPUID Fn8000_0007_EDX[12]. Signed-off-by: Huang Rui Signed-off-by: Borislav Petkov Cc: Aaron Lu Cc: Alexander Shishkin Cc: Andreas Herrmann Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: Fengguang Wu Cc: Frédéric Weisbecker Cc: Guenter Roeck Cc: H. Peter Anvin Cc: Hector Marco-Gisbert Cc: Jacob Shin Cc: Jiri Olsa Cc: John Stultz Cc: Kristen Carlson Accardi Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Robert Richter Cc: Ross Zwisler Cc: Stephane Eranian Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wan Zongshun Cc: spg_linux_kernel@amd.com Link: http://lkml.kernel.org/r/1452739808-11871-4-git-send-email-ray.huang@amd.com [ Resolved conflict and moved the synthetic CPUID slot to 19. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 746dd6ae4932..44ebd04878eb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -94,7 +94,7 @@ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ #define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ #define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d4b06e8cd77c..68fe8d3bed56 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -546,6 +546,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_sched_clock_stable(); } + /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ + if (c->x86_power & BIT(12)) + set_cpu_cap(c, X86_FEATURE_ACC_POWER); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #else -- cgit v1.2.3 From c7ab62bfbe0e27ef452d19d88b083f01e99f13a7 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 9 Mar 2016 13:45:06 +0800 Subject: perf/x86/amd/power: Add AMD accumulated power reporting mechanism Introduce an AMD accumlated power reporting mechanism for the Family 15h, Model 60h processor that can be used to calculate the average power consumed by a processor during a measurement interval. The feature support is indicated by CPUID Fn8000_0007_EDX[12]. This feature will be implemented both in hwmon and perf. The current design provides one event to report per package/processor power consumption by counting each compute unit power value. Here the gory details of how the computation is done: * Tsample: compute unit power accumulator sample period * Tref: the PTSC counter period (PTSC: performance timestamp counter) * N: the ratio of compute unit power accumulator sample period to the PTSC period * Jmax: max compute unit accumulated power which is indicated by MSR_C001007b[MaxCpuSwPwrAcc] * Jx/Jy: compute unit accumulated power which is indicated by MSR_C001007a[CpuSwPwrAcc] * Tx/Ty: the value of performance timestamp counter which is indicated by CU_PTSC MSR_C0010280[PTSC] * PwrCPUave: CPU average power i. Determine the ratio of Tsample to Tref by executing CPUID Fn8000_0007. N = value of CPUID Fn8000_0007_ECX[CpuPwrSampleTimeRatio[15:0]]. ii. Read the full range of the cumulative energy value from the new MSR MaxCpuSwPwrAcc. Jmax = value returned. iii. At time x, software reads CpuSwPwrAcc and samples the PTSC. Jx = value read from CpuSwPwrAcc and Tx = value read from PTSC. iv. At time y, software reads CpuSwPwrAcc and samples the PTSC. Jy = value read from CpuSwPwrAcc and Ty = value read from PTSC. v. Calculate the average power consumption for a compute unit over time period (y-x). Unit of result is uWatt: if (Jy < Jx) // Rollover has occurred Jdelta = (Jy + Jmax) - Jx else Jdelta = Jy - Jx PwrCPUave = N * Jdelta * 1000 / (Ty - Tx) Simple example: root@hr-zp:/home/ray/tip# ./tools/perf/perf stat -a -e 'power/power-pkg/' make -j4 CHK include/config/kernel.release CHK include/generated/uapi/linux/version.h CHK include/generated/utsrelease.h CHK include/generated/timeconst.h CHK include/generated/bounds.h CHK include/generated/asm-offsets.h CALL scripts/checksyscalls.sh CHK include/generated/compile.h SKIPPED include/generated/compile.h Building modules, stage 2. Kernel: arch/x86/boot/bzImage is ready (#40) MODPOST 4225 modules Performance counter stats for 'system wide': 183.44 mWatts power/power-pkg/ 341.837270111 seconds time elapsed root@hr-zp:/home/ray/tip# ./tools/perf/perf stat -a -e 'power/power-pkg/' sleep 10 Performance counter stats for 'system wide': 0.18 mWatts power/power-pkg/ 10.012551815 seconds time elapsed Suggested-by: Peter Zijlstra Suggested-by: Ingo Molnar Suggested-by: Borislav Petkov Signed-off-by: Huang Rui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: David Ahern Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Namhyung Kim Cc: Robert Richter Cc: Stephane Eranian Cc: Vince Weaver Cc: jacob.w.shin@gmail.com Link: http://lkml.kernel.org/r/1457502306-2559-1-git-send-email-ray.huang@amd.com [ Fixed the modular build. ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 ++ arch/x86/events/Makefile | 1 + arch/x86/events/amd/power.c | 353 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/events/core.c | 4 +- include/linux/perf_event.h | 4 + 5 files changed, 369 insertions(+), 2 deletions(-) create mode 100644 arch/x86/events/amd/power.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8f2e6659281b..a313c0e7e165 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1206,6 +1206,15 @@ config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE +config PERF_EVENTS_AMD_POWER + depends on PERF_EVENTS && CPU_SUP_AMD + tristate "AMD Processor Power Reporting Mechanism" + ---help--- + Provide power reporting mechanism support for AMD processors. + Currently, it leverages X86_FEATURE_ACC_POWER + (CPUID Fn8000_0007_EDX[12]) interface to calculate the + average power consumption on Family 15h processors. + config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" ---help--- diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index fdfea1511cc0..f59618a39990 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,6 +1,7 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o +obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c new file mode 100644 index 000000000000..55a3529dbf12 --- /dev/null +++ b/arch/x86/events/amd/power.c @@ -0,0 +1,353 @@ +/* + * Performance events - AMD Processor Power Reporting Mechanism + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Huang Rui + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include "../perf_event.h" + +#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a +#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b +#define MSR_F15H_PTSC 0xc0010280 + +/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */ +#define AMD_POWER_EVENT_MASK 0xFFULL + +/* + * Accumulated power status counters. + */ +#define AMD_POWER_EVENTSEL_PKG 1 + +/* + * The ratio of compute unit power accumulator sample period to the + * PTSC period. + */ +static unsigned int cpu_pwr_sample_ratio; + +/* Maximum accumulated power of a compute unit. */ +static u64 max_cu_acc_power; + +static struct pmu pmu_class; + +/* + * Accumulated power represents the sum of each compute unit's (CU) power + * consumption. On any core of each CU we read the total accumulated power from + * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores + * which are picked to measure the power for the CUs they belong to. + */ +static cpumask_t cpu_mask; + +static void event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc; + u64 delta, tdelta; + + prev_pwr_acc = hwc->pwr_acc; + prev_ptsc = hwc->ptsc; + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc); + rdmsrl(MSR_F15H_PTSC, new_ptsc); + + /* + * Calculate the CU power consumption over a time period, the unit of + * final value (delta) is micro-Watts. Then add it to the event count. + */ + if (new_pwr_acc < prev_pwr_acc) { + delta = max_cu_acc_power + new_pwr_acc; + delta -= prev_pwr_acc; + } else + delta = new_pwr_acc - prev_pwr_acc; + + delta *= cpu_pwr_sample_ratio * 1000; + tdelta = new_ptsc - prev_ptsc; + + do_div(delta, tdelta); + local64_add(delta, &event->count); +} + +static void __pmu_event_start(struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + rdmsrl(MSR_F15H_PTSC, event->hw.ptsc); + rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc); +} + +static void pmu_event_start(struct perf_event *event, int mode) +{ + __pmu_event_start(event); +} + +static void pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + /* Mark event as deactivated and stopped. */ + if (!(hwc->state & PERF_HES_STOPPED)) + hwc->state |= PERF_HES_STOPPED; + + /* Check if software counter update is necessary. */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of an event + * that we are disabling: + */ + event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int pmu_event_add(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __pmu_event_start(event); + + return 0; +} + +static void pmu_event_del(struct perf_event *event, int flags) +{ + pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK; + + /* Only look at AMD power events. */ + if (event->attr.type != pmu_class.type) + return -ENOENT; + + /* Unsupported modes and filters. */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + /* no sampling */ + event->attr.sample_period) + return -EINVAL; + + if (cfg != AMD_POWER_EVENTSEL_PKG) + return -EINVAL; + + return 0; +} + +static void pmu_event_read(struct perf_event *event) +{ + event_update(event); +} + +static ssize_t +get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL); + +static struct attribute *pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group pmu_attr_group = { + .attrs = pmu_attrs, +}; + +/* + * Currently it only supports to report the power of each + * processor/package. + */ +EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01"); + +EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts"); + +/* Convert the count from micro-Watts to milli-Watts. */ +EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3"); + +static struct attribute *events_attr[] = { + EVENT_PTR(power_pkg), + EVENT_PTR(power_pkg_unit), + EVENT_PTR(power_pkg_scale), + NULL, +}; + +static struct attribute_group pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); + +static struct attribute *formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group pmu_format_group = { + .name = "format", + .attrs = formats_attr, +}; + +static const struct attribute_group *attr_groups[] = { + &pmu_attr_group, + &pmu_format_group, + &pmu_events_group, + NULL, +}; + +static struct pmu pmu_class = { + .attr_groups = attr_groups, + /* system-wide only */ + .task_ctx_nr = perf_invalid_context, + .event_init = pmu_event_init, + .add = pmu_event_add, + .del = pmu_event_del, + .start = pmu_event_start, + .stop = pmu_event_stop, + .read = pmu_event_read, +}; + +static void power_cpu_exit(int cpu) +{ + int target; + + if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask)) + return; + + /* + * Find a new CPU on the same compute unit, if was set in cpumask + * and still some CPUs on compute unit. Then migrate event and + * context to new CPU. + */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target < nr_cpumask_bits) { + cpumask_set_cpu(target, &cpu_mask); + perf_pmu_migrate_context(&pmu_class, cpu, target); + } +} + +static void power_cpu_init(int cpu) +{ + int target; + + /* + * 1) If any CPU is set at cpu_mask in the same compute unit, do + * nothing. + * 2) If no CPU is set at cpu_mask in the same compute unit, + * set current STARTING CPU. + * + * Note: if there is a CPU aside of the new one already in the + * sibling mask, then it is also in cpu_mask. + */ + target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + if (target >= nr_cpumask_bits) + cpumask_set_cpu(cpu, &cpu_mask); +} + +static int +power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + power_cpu_init(cpu); + break; + case CPU_DOWN_PREPARE: + power_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block power_cpu_notifier_nb = { + .notifier_call = power_cpu_notifier, + .priority = CPU_PRI_PERF, +}; + +static const struct x86_cpu_id cpu_match[] = { + { .vendor = X86_VENDOR_AMD, .family = 0x15 }, + {}, +}; + +static int __init amd_power_pmu_init(void) +{ + int cpu, target, ret; + + if (!x86_match_cpu(cpu_match)) + return 0; + + if (!boot_cpu_has(X86_FEATURE_ACC_POWER)) + return -ENODEV; + + cpu_pwr_sample_ratio = cpuid_ecx(0x80000007); + + if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) { + pr_err("Failed to read max compute unit power accumulator MSR\n"); + return -ENODEV; + } + + cpu_notifier_register_begin(); + + /* Choose one online core of each compute unit. */ + for_each_online_cpu(cpu) { + target = cpumask_first(topology_sibling_cpumask(cpu)); + if (!cpumask_test_cpu(target, &cpu_mask)) + cpumask_set_cpu(target, &cpu_mask); + } + + ret = perf_pmu_register(&pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_warn("AMD Power PMU registration failed\n"); + goto out; + } + + __register_cpu_notifier(&power_cpu_notifier_nb); + + pr_info("AMD Power PMU detected\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +module_init(amd_power_pmu_init); + +static void __exit amd_power_pmu_exit(void) +{ + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&power_cpu_notifier_nb); + cpu_notifier_register_done(); + + perf_pmu_unregister(&pmu_class); +} +module_exit(amd_power_pmu_exit); + +MODULE_AUTHOR("Huang Rui "); +MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5e830d0c95c9..002b2eadd600 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1602,8 +1602,7 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) return new; } -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page) +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = \ container_of(attr, struct perf_pmu_events_attr, attr); @@ -1615,6 +1614,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, return x86_pmu.events_sysfs_show(page, config); } +EXPORT_SYMBOL_GPL(events_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7bb315bec3aa..15588d4c581d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,10 @@ struct hw_perf_event { struct { /* itrace */ int itrace_started; }; + struct { /* amd_power */ + u64 pwr_acc; + u64 ptsc; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* -- cgit v1.2.3 From cb2252522aaff572f28dc6613307e1e0e62496cd Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Sun, 20 Mar 2016 11:58:21 -0700 Subject: perf/x86/intel/uncore: Remove ev_sel_ext bit support for PCU The ev_sel_ext in PCU_MSR_PMON_CTL is locked on some CPU models, so despite it being documented in the SDM, if we write 1 to that bit then we can get a #GP fault. Which #GP the perf fuzzer happily triggered in Peter Zijlstra's testing. Also, there are no public events which use that bit, so remove ev_sel_ext bit support for PCU. Reported-by: Peter Zijlstra Signed-off-by: Kan Liang Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1458500301-3594-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 93f6bd9bf761..ab2bcaaebe38 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -46,7 +46,6 @@ (SNBEP_PMON_CTL_EV_SEL_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ @@ -148,7 +147,6 @@ /* IVBEP PCU */ #define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ @@ -258,7 +256,6 @@ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ SNBEP_PMON_CTL_EDGE_DET | \ SNBEP_CBO_PMON_CTL_TID_EN | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ SNBEP_PMON_CTL_INVERT | \ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ @@ -472,7 +469,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = { }; static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_inv.attr, @@ -1313,7 +1310,7 @@ static struct attribute *ivbep_uncore_cbox_formats_attr[] = { }; static struct attribute *ivbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_occ_sel.attr, &format_attr_edge.attr, &format_attr_thresh5.attr, -- cgit v1.2.3 From 7b0fd56930399d87241ad2f49d48c315307471ee Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 20 Mar 2016 16:52:18 -0700 Subject: perf/x86/intel/rapl: Add missing Broadwell models Added Broadwell-H and Broadwell-Server. Signed-off-by: Srinivas Pandruvada Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: bp@alien8.de Link: http://lkml.kernel.org/r/1458517938-25308-1-git-send-email-srinivas.pandruvada@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/rapl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index b834a3f55a01..70c93f9b03ac 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -711,6 +711,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ + case 79: /* Broadwell-Server */ apply_quirk = true; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; @@ -718,6 +719,7 @@ static int __init rapl_pmu_init(void) case 60: /* Haswell */ case 69: /* Haswell-Celeron */ case 61: /* Broadwell */ + case 71: /* Broadwell-H */ rapl_cntr_mask = RAPL_IDX_HSW; rapl_pmu_events_group.attrs = rapl_events_hsw_attr; break; -- cgit v1.2.3