244 files changed, 9952 insertions, 883 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 44e09c2641ad..8c40e7cfba2a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14290,8 +14290,10 @@ R:	Mark Rutland <mark.rutland@arm.com>
 R:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
 R:	Jiri Olsa <jolsa@redhat.com>
 R:	Namhyung Kim <namhyung@kernel.org>
+L:	linux-perf-users@vger.kernel.org
 L:	linux-kernel@vger.kernel.org
 S:	Supported
+W:	https://perf.wiki.kernel.org/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core
 F:	arch/*/events/*
 F:	arch/*/events/*/*
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 74e255d58d8d..04a8e3db8a54 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -52,6 +52,7 @@ FEATURE_TESTS_BASIC :=                  \
         libpython-version               \
         libslang                        \
         libslang-include-subdir         \
+        libtraceevent                   \
         libcrypto                       \
         libunwind                       \
         pthread-attr-setaffinity-np     \
@@ -239,17 +240,24 @@ ifeq ($(VF),1)
   feature_verbose := 1
 endif
 
-ifeq ($(feature_display),1)
-  $(info )
-  $(info Auto-detecting system features:)
-  $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),))
-  ifneq ($(feature_verbose),1)
+feature_display_entries = $(eval $(feature_display_entries_code))
+define feature_display_entries_code
+  ifeq ($(feature_display),1)
     $(info )
+    $(info Auto-detecting system features:)
+    $(foreach feat,$(FEATURE_DISPLAY),$(call feature_print_status,$(feat),))
+    ifneq ($(feature_verbose),1)
+      $(info )
+    endif
   endif
-endif
 
-ifeq ($(feature_verbose),1)
-  TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS))
-  $(foreach feat,$(TMP),$(call feature_print_status,$(feat),))
-  $(info )
+  ifeq ($(feature_verbose),1)
+    TMP := $(filter-out $(FEATURE_DISPLAY),$(FEATURE_TESTS))
+    $(foreach feat,$(TMP),$(call feature_print_status,$(feat),))
+    $(info )
+  endif
+endef
+
+ifeq ($(FEATURE_DISPLAY_DEFERRED),)
+  $(call feature_display_entries)
 endif
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 3e55edb3ea54..ec203e28407f 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -36,6 +36,7 @@ FILES=                                          \
          test-libpython-version.bin             \
          test-libslang.bin                      \
          test-libslang-include-subdir.bin       \
+         test-libtraceevent.bin                 \
          test-libcrypto.bin                     \
          test-libunwind.bin                     \
          test-libunwind-debug-frame.bin         \
@@ -196,6 +197,9 @@ $(OUTPUT)test-libslang.bin:
 $(OUTPUT)test-libslang-include-subdir.bin:
 	$(BUILD) -lslang
 
+$(OUTPUT)test-libtraceevent.bin:
+	$(BUILD) -ltraceevent
+
 $(OUTPUT)test-libcrypto.bin:
 	$(BUILD) -lcrypto
 
diff --git a/tools/build/feature/test-libtraceevent.c b/tools/build/feature/test-libtraceevent.c
new file mode 100644
index 000000000000..416b11ffd4b4
--- /dev/null
+++ b/tools/build/feature/test-libtraceevent.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <traceevent/trace-seq.h>
+
+int main(void)
+{
+	int rv = 0;
+	struct trace_seq s;
+	trace_seq_init(&s);
+	rv += !(s.state == TRACE_SEQ__GOOD);
+	trace_seq_destroy(&s);
+	return rv;
+}
diff --git a/tools/include/linux/math64.h b/tools/include/linux/math64.h
new file mode 100644
index 000000000000..4ad45d5943dc
--- /dev/null
+++ b/tools/include/linux/math64.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MATH64_H
+#define _LINUX_MATH64_H
+
+#include <linux/types.h>
+
+#ifdef __x86_64__
+static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
+{
+	u64 q;
+
+	asm ("mulq %2; divq %3" : "=a" (q)
+				: "a" (a), "rm" (b), "rm" (c)
+				: "rdx");
+
+	return q;
+}
+#define mul_u64_u64_div64 mul_u64_u64_div64
+#endif
+
+#ifdef __SIZEOF_INT128__
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+	return (u64)(((unsigned __int128)a * b) >> shift);
+}
+
+#else
+
+#ifdef __i386__
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	u32 high, low;
+
+	asm ("mull %[b]" : "=a" (low), "=d" (high)
+			 : [a] "a" (a), [b] "rm" (b) );
+
+	return low | ((u64)high) << 32;
+}
+#else
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return (u64)a * b;
+}
+#endif
+
+static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
+{
+	u32 ah, al;
+	u64 ret;
+
+	al = a;
+	ah = a >> 32;
+
+	ret = mul_u32_u32(al, b) >> shift;
+	if (ah)
+		ret += mul_u32_u32(ah, b) << (32 - shift);
+
+	return ret;
+}
+
+#endif	/* __SIZEOF_INT128__ */
+
+#ifndef mul_u64_u64_div64
+static inline u64 mul_u64_u64_div64(u64 a, u64 b, u64 c)
+{
+	u64 quot, rem;
+
+	quot = a / c;
+	rem = a % c;
+
+	return quot * b + (rem * b) / c;
+}
+#endif
+
+#endif /* _LINUX_MATH64_H */
diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h
index e9c5a215837d..6e14a533ab4e 100644
--- a/tools/include/linux/types.h
+++ b/tools/include/linux/types.h
@@ -61,6 +61,9 @@ typedef __u32 __bitwise __be32;
 typedef __u64 __bitwise __le64;
 typedef __u64 __bitwise __be64;
 
+typedef __u16 __bitwise __sum16;
+typedef __u32 __bitwise __wsum;
+
 typedef struct {
 	int counter;
 } atomic_t;
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index ad15e40d7f5d..14332f4cf816 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -38,6 +38,21 @@ enum perf_type_id {
 };
 
 /*
+ * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
+ * PERF_TYPE_HARDWARE:			0xEEEEEEEE000000AA
+ *					AA: hardware event ID
+ *					EEEEEEEE: PMU type ID
+ * PERF_TYPE_HW_CACHE:			0xEEEEEEEE00DDCCBB
+ *					BB: hardware cache ID
+ *					CC: hardware cache op ID
+ *					DD: hardware cache op result ID
+ *					EEEEEEEE: PMU type ID
+ * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied.
+ */
+#define PERF_PMU_TYPE_SHIFT		32
+#define PERF_HW_EVENT_MASK		0xffffffff
+
+/*
  * Generalized performance event event_id types, used by the
  * attr.event_id parameter of the sys_perf_event_open()
  * syscall:
diff --git a/tools/lib/perf/Documentation/libperf.txt b/tools/lib/perf/Documentation/libperf.txt
index 0c74c30ed23a..63ae5e0195ce 100644
--- a/tools/lib/perf/Documentation/libperf.txt
+++ b/tools/lib/perf/Documentation/libperf.txt
@@ -136,6 +136,9 @@ SYNOPSIS
                        struct perf_thread_map *threads);
   void perf_evsel__close(struct perf_evsel *evsel);
   void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu);
+  int perf_evsel__mmap(struct perf_evsel *evsel, int pages);
+  void perf_evsel__munmap(struct perf_evsel *evsel);
+  void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread);
   int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
                        struct perf_counts_values *count);
   int perf_evsel__enable(struct perf_evsel *evsel);
diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index 4dc06289f4c7..bd8c2f19ef74 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -11,10 +11,12 @@
 #include <stdlib.h>
 #include <internal/xyarray.h>
 #include <internal/cpumap.h>
+#include <internal/mmap.h>
 #include <internal/threadmap.h>
 #include <internal/lib.h>
 #include <linux/string.h>
 #include <sys/ioctl.h>
+#include <sys/mman.h>
 
 void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr)
 {
@@ -38,6 +40,7 @@ void perf_evsel__delete(struct perf_evsel *evsel)
 }
 
 #define FD(e, x, y) (*(int *) xyarray__entry(e->fd, x, y))
+#define MMAP(e, x, y) (e->mmap ? ((struct perf_mmap *) xyarray__entry(e->mmap, x, y)) : NULL)
 
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
@@ -55,6 +58,13 @@ int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 	return evsel->fd != NULL ? 0 : -ENOMEM;
 }
 
+static int perf_evsel__alloc_mmap(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+	evsel->mmap = xyarray__new(ncpus, nthreads, sizeof(struct perf_mmap));
+
+	return evsel->mmap != NULL ? 0 : -ENOMEM;
+}
+
 static int
 sys_perf_event_open(struct perf_event_attr *attr,
 		    pid_t pid, int cpu, int group_fd,
@@ -156,6 +166,72 @@ void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu)
 	perf_evsel__close_fd_cpu(evsel, cpu);
 }
 
+void perf_evsel__munmap(struct perf_evsel *evsel)
+{
+	int cpu, thread;
+
+	if (evsel->fd == NULL || evsel->mmap == NULL)
+		return;
+
+	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+		for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
+			int fd = FD(evsel, cpu, thread);
+			struct perf_mmap *map = MMAP(evsel, cpu, thread);
+
+			if (fd < 0)
+				continue;
+
+			perf_mmap__munmap(map);
+		}
+	}
+
+	xyarray__delete(evsel->mmap);
+	evsel->mmap = NULL;
+}
+
+int perf_evsel__mmap(struct perf_evsel *evsel, int pages)
+{
+	int ret, cpu, thread;
+	struct perf_mmap_param mp = {
+		.prot = PROT_READ | PROT_WRITE,
+		.mask = (pages * page_size) - 1,
+	};
+
+	if (evsel->fd == NULL || evsel->mmap)
+		return -EINVAL;
+
+	if (perf_evsel__alloc_mmap(evsel, xyarray__max_x(evsel->fd), xyarray__max_y(evsel->fd)) < 0)
+		return -ENOMEM;
+
+	for (cpu = 0; cpu < xyarray__max_x(evsel->fd); cpu++) {
+		for (thread = 0; thread < xyarray__max_y(evsel->fd); thread++) {
+			int fd = FD(evsel, cpu, thread);
+			struct perf_mmap *map = MMAP(evsel, cpu, thread);
+
+			if (fd < 0)
+				continue;
+
+			perf_mmap__init(map, NULL, false, NULL);
+
+			ret = perf_mmap__mmap(map, &mp, fd, cpu);
+			if (ret) {
+				perf_evsel__munmap(evsel);
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread)
+{
+	if (FD(evsel, cpu, thread) < 0 || MMAP(evsel, cpu, thread) == NULL)
+		return NULL;
+
+	return MMAP(evsel, cpu, thread)->base;
+}
+
 int perf_evsel__read_size(struct perf_evsel *evsel)
 {
 	u64 read_format = evsel->attr.read_format;
@@ -191,6 +267,10 @@ int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
 	if (FD(evsel, cpu, thread) < 0)
 		return -EINVAL;
 
+	if (MMAP(evsel, cpu, thread) &&
+	    !perf_mmap__read_self(MMAP(evsel, cpu, thread), count))
+		return 0;
+
 	if (readn(FD(evsel, cpu, thread), count->values, size) <= 0)
 		return -errno;
 
diff --git a/tools/lib/perf/include/internal/evsel.h b/tools/lib/perf/include/internal/evsel.h
index 1ffd083b235e..1c067d088bc6 100644
--- a/tools/lib/perf/include/internal/evsel.h
+++ b/tools/lib/perf/include/internal/evsel.h
@@ -41,6 +41,7 @@ struct perf_evsel {
 	struct perf_cpu_map	*own_cpus;
 	struct perf_thread_map	*threads;
 	struct xyarray		*fd;
+	struct xyarray		*mmap;
 	struct xyarray		*sample_id;
 	u64			*id;
 	u32			 ids;
diff --git a/tools/lib/perf/include/internal/mmap.h b/tools/lib/perf/include/internal/mmap.h
index be7556e0a2b2..5e3422f40ed5 100644
--- a/tools/lib/perf/include/internal/mmap.h
+++ b/tools/lib/perf/include/internal/mmap.h
@@ -11,6 +11,7 @@
 #define PERF_SAMPLE_MAX_SIZE (1 << 16)
 
 struct perf_mmap;
+struct perf_counts_values;
 
 typedef void (*libperf_unmap_cb_t)(struct perf_mmap *map);
 
@@ -52,4 +53,6 @@ void perf_mmap__put(struct perf_mmap *map);
 
 u64 perf_mmap__read_head(struct perf_mmap *map);
 
+int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count);
+
 #endif /* __LIBPERF_INTERNAL_MMAP_H */
diff --git a/tools/lib/perf/include/internal/tests.h b/tools/lib/perf/include/internal/tests.h
index 2093e8868a67..29425c2dabe1 100644
--- a/tools/lib/perf/include/internal/tests.h
+++ b/tools/lib/perf/include/internal/tests.h
@@ -3,11 +3,32 @@
 #define __LIBPERF_INTERNAL_TESTS_H
 
 #include <stdio.h>
+#include <unistd.h>
 
 int tests_failed;
+int tests_verbose;
+
+static inline int get_verbose(char **argv, int argc)
+{
+	int c;
+	int verbose = 0;
+
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c)
+		{
+		case 'v':
+			verbose = 1;
+			break;
+		default:
+			break;
+		}
+	}
+	return verbose;
+}
 
 #define __T_START					\
 do {							\
+	tests_verbose = get_verbose(argv, argc);	\
 	fprintf(stdout, "- running %s...", __FILE__);	\
 	fflush(NULL);					\
 	tests_failed = 0;				\
@@ -30,4 +51,15 @@ do {
 	}                                                                        \
 } while (0)
 
+#define __T_VERBOSE(...)						\
+do {									\
+	if (tests_verbose) {						\
+		if (tests_verbose == 1) {				\
+			fputc('\n', stderr);				\
+			tests_verbose++;				\
+		}							\
+		fprintf(stderr, ##__VA_ARGS__);				\
+	}								\
+} while (0)
+
 #endif /* __LIBPERF_INTERNAL_TESTS_H */
diff --git a/tools/lib/perf/include/internal/xyarray.h b/tools/lib/perf/include/internal/xyarray.h
index 51e35d6c8ec4..f10af3da7b21 100644
--- a/tools/lib/perf/include/internal/xyarray.h
+++ b/tools/lib/perf/include/internal/xyarray.h
@@ -18,11 +18,18 @@ struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size);
 void xyarray__delete(struct xyarray *xy);
 void xyarray__reset(struct xyarray *xy);
 
-static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
+static inline void *__xyarray__entry(struct xyarray *xy, int x, int y)
 {
 	return &xy->contents[x * xy->row_size + y * xy->entry_size];
 }
 
+static inline void *xyarray__entry(struct xyarray *xy, size_t x, size_t y)
+{
+	if (x >= xy->max_x || y >= xy->max_y)
+		return NULL;
+	return __xyarray__entry(xy, x, y);
+}
+
 static inline int xyarray__max_y(struct xyarray *xy)
 {
 	return xy->max_y;
diff --git a/tools/lib/perf/include/perf/bpf_perf.h b/tools/lib/perf/include/perf/bpf_perf.h
new file mode 100644
index 000000000000..e7cf6ba7b674
--- /dev/null
+++ b/tools/lib/perf/include/perf/bpf_perf.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __LIBPERF_BPF_PERF_H
+#define __LIBPERF_BPF_PERF_H
+
+#include <linux/types.h>  /* for __u32 */
+
+/*
+ * bpf_perf uses a hashmap, the attr_map, to track all the leader programs.
+ * The hashmap is pinned in bpffs. flock() on this file is used to ensure
+ * no concurrent access to the attr_map.  The key of attr_map is struct
+ * perf_event_attr, and the value is struct perf_event_attr_map_entry.
+ *
+ * struct perf_event_attr_map_entry contains two __u32 IDs, bpf_link of the
+ * leader prog, and the diff_map. Each perf-stat session holds a reference
+ * to the bpf_link to make sure the leader prog is attached to sched_switch
+ * tracepoint.
+ *
+ * Since the hashmap only contains IDs of the bpf_link and diff_map, it
+ * does not hold any references to the leader program. Once all perf-stat
+ * sessions of these events exit, the leader prog, its maps, and the
+ * perf_events will be freed.
+ */
+struct perf_event_attr_map_entry {
+	__u32 link_id;
+	__u32 diff_map_id;
+};
+
+/* default attr_map name */
+#define BPF_PERF_DEFAULT_ATTR_MAP_PATH "perf_attr_map"
+
+#endif /* __LIBPERF_BPF_PERF_H */
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index d82054225fcc..4d0c02ba3f7d 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -8,6 +8,8 @@
 #include <linux/bpf.h>
 #include <sys/types.h> /* pid_t */
 
+#define event_contains(obj, mem) ((obj).header.size > offsetof(typeof(obj), mem))
+
 struct perf_record_mmap {
 	struct perf_event_header header;
 	__u32			 pid, tid;
@@ -346,8 +348,9 @@ struct perf_record_time_conv {
 	__u64			 time_zero;
 	__u64			 time_cycles;
 	__u64			 time_mask;
-	bool			 cap_user_time_zero;
-	bool			 cap_user_time_short;
+	__u8			 cap_user_time_zero;
+	__u8			 cap_user_time_short;
+	__u8			 reserved[6];	/* For alignment */
 };
 
 struct perf_record_header_feature {
diff --git a/tools/lib/perf/include/perf/evsel.h b/tools/lib/perf/include/perf/evsel.h
index c82ec39a4ad0..60eae25076d3 100644
--- a/tools/lib/perf/include/perf/evsel.h
+++ b/tools/lib/perf/include/perf/evsel.h
@@ -27,6 +27,9 @@ LIBPERF_API int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *
 				 struct perf_thread_map *threads);
 LIBPERF_API void perf_evsel__close(struct perf_evsel *evsel);
 LIBPERF_API void perf_evsel__close_cpu(struct perf_evsel *evsel, int cpu);
+LIBPERF_API int perf_evsel__mmap(struct perf_evsel *evsel, int pages);
+LIBPERF_API void perf_evsel__munmap(struct perf_evsel *evsel);
+LIBPERF_API void *perf_evsel__mmap_base(struct perf_evsel *evsel, int cpu, int thread);
 LIBPERF_API int perf_evsel__read(struct perf_evsel *evsel, int cpu, int thread,
 				 struct perf_counts_values *count);
 LIBPERF_API int perf_evsel__enable(struct perf_evsel *evsel);
diff --git a/tools/lib/perf/libperf.map b/tools/lib/perf/libperf.map
index 7be1af8a546c..c0c7ceb11060 100644
--- a/tools/lib/perf/libperf.map
+++ b/tools/lib/perf/libperf.map
@@ -23,6 +23,9 @@ LIBPERF_0.0.1 {
 		perf_evsel__disable;
 		perf_evsel__open;
 		perf_evsel__close;
+		perf_evsel__mmap;
+		perf_evsel__munmap;
+		perf_evsel__mmap_base;
 		perf_evsel__read;
 		perf_evsel__cpus;
 		perf_evsel__threads;
diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c
index 79d5ed6c38cc..c89dfa5f67b3 100644
--- a/tools/lib/perf/mmap.c
+++ b/tools/lib/perf/mmap.c
@@ -8,9 +8,11 @@
 #include <linux/perf_event.h>
 #include <perf/mmap.h>
 #include <perf/event.h>
+#include <perf/evsel.h>
 #include <internal/mmap.h>
 #include <internal/lib.h>
 #include <linux/kernel.h>
+#include <linux/math64.h>
 #include "internal.h"
 
 void perf_mmap__init(struct perf_mmap *map, struct perf_mmap *prev,
@@ -273,3 +275,89 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map)
 
 	return event;
 }
+
+#if defined(__i386__) || defined(__x86_64__)
+static u64 read_perf_counter(unsigned int counter)
+{
+	unsigned int low, high;
+
+	asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
+
+	return low | ((u64)high) << 32;
+}
+
+static u64 read_timestamp(void)
+{
+	unsigned int low, high;
+
+	asm volatile("rdtsc" : "=a" (low), "=d" (high));
+
+	return low | ((u64)high) << 32;
+}
+#else
+static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; }
+static u64 read_timestamp(void) { return 0; }
+#endif
+
+int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count)
+{
+	struct perf_event_mmap_page *pc = map->base;
+	u32 seq, idx, time_mult = 0, time_shift = 0;
+	u64 cnt, cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL;
+
+	if (!pc || !pc->cap_user_rdpmc)
+		return -1;
+
+	do {
+		seq = READ_ONCE(pc->lock);
+		barrier();
+
+		count->ena = READ_ONCE(pc->time_enabled);
+		count->run = READ_ONCE(pc->time_running);
+
+		if (pc->cap_user_time && count->ena != count->run) {
+			cyc = read_timestamp();
+			time_mult = READ_ONCE(pc->time_mult);
+			time_shift = READ_ONCE(pc->time_shift);
+			time_offset = READ_ONCE(pc->time_offset);
+
+			if (pc->cap_user_time_short) {
+				time_cycles = READ_ONCE(pc->time_cycles);
+				time_mask = READ_ONCE(pc->time_mask);
+			}
+		}
+
+		idx = READ_ONCE(pc->index);
+		cnt = READ_ONCE(pc->offset);
+		if (pc->cap_user_rdpmc && idx) {
+			s64 evcnt = read_perf_counter(idx - 1);
+			u16 width = READ_ONCE(pc->pmc_width);
+
+			evcnt <<= 64 - width;
+			evcnt >>= 64 - width;
+			cnt += evcnt;
+		} else
+			return -1;
+
+		barrier();
+	} while (READ_ONCE(pc->lock) != seq);
+
+	if (count->ena != count->run) {
+		u64 delta;
+
+		/* Adjust for cap_usr_time_short, a nop if not */
+		cyc = time_cycles + ((cyc - time_cycles) & time_mask);
+
+		delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);
+
+		count->ena += delta;
+		if (idx)
+			count->run += delta;
+
+		cnt = mul_u64_u64_div64(cnt, count->ena, count->run);
+	}
+
+	count->val = cnt;
+
+	return 0;
+}
diff --git a/tools/lib/perf/tests/Makefile b/tools/lib/perf/tests/Makefile
index 96841775feaf..b536cc9a26dd 100644
--- a/tools/lib/perf/tests/Makefile
+++ b/tools/lib/perf/tests/Makefile
@@ -5,6 +5,8 @@ TESTS = test-cpumap test-threadmap test-evlist test-evsel
 TESTS_SO := $(addsuffix -so,$(TESTS))
 TESTS_A  := $(addsuffix -a,$(TESTS))
 
+TEST_ARGS := $(if $(V),-v)
+
 # Set compile option CFLAGS
 ifdef EXTRA_CFLAGS
   CFLAGS := $(EXTRA_CFLAGS)
@@ -28,9 +30,9 @@ all: $(TESTS_A) $(TESTS_SO)
 
 run:
 	@echo "running static:"
-	@for i in $(TESTS_A); do ./$$i; done
+	@for i in $(TESTS_A); do ./$$i $(TEST_ARGS); done
 	@echo "running dynamic:"
-	@for i in $(TESTS_SO); do LD_LIBRARY_PATH=../ ./$$i; done
+	@for i in $(TESTS_SO); do LD_LIBRARY_PATH=../ ./$$i $(TEST_ARGS); done
 
 clean:
 	$(call QUIET_CLEAN, tests)$(RM) $(TESTS_A) $(TESTS_SO)
diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c
index 0ad82d7a2a51..288b5feaefe2 100644
--- a/tools/lib/perf/tests/test-evsel.c
+++ b/tools/lib/perf/tests/test-evsel.c
@@ -120,6 +120,70 @@ static int test_stat_thread_enable(void)
 	return 0;
 }
 
+static int test_stat_user_read(int event)
+{
+	struct perf_counts_values counts = { .val = 0 };
+	struct perf_thread_map *threads;
+	struct perf_evsel *evsel;
+	struct perf_event_mmap_page *pc;
+	struct perf_event_attr attr = {
+		.type	= PERF_TYPE_HARDWARE,
+		.config	= event,
+	};
+	int err, i;
+
+	threads = perf_thread_map__new_dummy();
+	__T("failed to create threads", threads);
+
+	perf_thread_map__set_pid(threads, 0, 0);
+
+	evsel = perf_evsel__new(&attr);
+	__T("failed to create evsel", evsel);
+
+	err = perf_evsel__open(evsel, NULL, threads);
+	__T("failed to open evsel", err == 0);
+
+	err = perf_evsel__mmap(evsel, 0);
+	__T("failed to mmap evsel", err == 0);
+
+	pc = perf_evsel__mmap_base(evsel, 0, 0);
+
+#if defined(__i386__) || defined(__x86_64__)
+	__T("userspace counter access not supported", pc->cap_user_rdpmc);
+	__T("userspace counter access not enabled", pc->index);
+	__T("userspace counter width not set", pc->pmc_width >= 32);
+#endif
+
+	perf_evsel__read(evsel, 0, 0, &counts);
+	__T("failed to read value for evsel", counts.val != 0);
+
+	for (i = 0; i < 5; i++) {
+		volatile int count = 0x10000 << i;
+		__u64 start, end, last = 0;
+
+		__T_VERBOSE("\tloop = %u, ", count);
+
+		perf_evsel__read(evsel, 0, 0, &counts);
+		start = counts.val;
+
+		while (count--) ;
+
+		perf_evsel__read(evsel, 0, 0, &counts);
+		end = counts.val;
+
+		__T("invalid counter data", (end - start) > last);
+		last = end - start;
+		__T_VERBOSE("count = %llu\n", end - start);
+	}
+
+	perf_evsel__munmap(evsel);
+	perf_evsel__close(evsel);
+	perf_evsel__delete(evsel);
+
+	perf_thread_map__put(threads);
+	return 0;
+}
+
 int main(int argc, char **argv)
 {
 	__T_START;
@@ -129,6 +193,8 @@ int main(int argc, char **argv)
 	test_stat_cpu();
 	test_stat_thread();
 	test_stat_thread_enable();
+	test_stat_user_read(PERF_COUNT_HW_INSTRUCTIONS);
+	test_stat_user_read(PERF_COUNT_HW_CPU_CYCLES);
 
 	__T_END;
 	return tests_failed == 0 ? 0 : -1;
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index f3f84781fd74..e555e9729758 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -20,6 +20,7 @@ perf.data.old
 output.svg
 perf-archive
 perf-with-kcore
+perf-iostat
 tags
 TAGS
 cscope*
diff --git a/tools/perf/Documentation/intel-hybrid.txt b/tools/perf/Documentation/intel-hybrid.txt
new file mode 100644
index 000000000000..07f0aa3bf682
--- /dev/null
+++ b/tools/perf/Documentation/intel-hybrid.txt
@@ -0,0 +1,214 @@
+Intel hybrid support
+--------------------
+Support for Intel hybrid events within perf tools.
+
+For some Intel platforms, such as AlderLake, which is hybrid platform and
+it consists of atom cpu and core cpu. Each cpu has dedicated event list.
+Part of events are available on core cpu, part of events are available
+on atom cpu and even part of events are available on both.
+
+Kernel exports two new cpu pmus via sysfs:
+/sys/devices/cpu_core
+/sys/devices/cpu_atom
+
+The 'cpus' files are created under the directories. For example,
+
+cat /sys/devices/cpu_core/cpus
+0-15
+
+cat /sys/devices/cpu_atom/cpus
+16-23
+
+It indicates cpu0-cpu15 are core cpus and cpu16-cpu23 are atom cpus.
+
+Quickstart
+
+List hybrid event
+-----------------
+
+As before, use perf-list to list the symbolic event.
+
+perf list
+
+inst_retired.any
+	[Fixed Counter: Counts the number of instructions retired. Unit: cpu_atom]
+inst_retired.any
+	[Number of instructions retired. Fixed Counter - architectural event. Unit: cpu_core]
+
+The 'Unit: xxx' is added to brief description to indicate which pmu
+the event is belong to. Same event name but with different pmu can
+be supported.
+
+Enable hybrid event with a specific pmu
+---------------------------------------
+
+To enable a core only event or atom only event, following syntax is supported:
+
+	cpu_core/<event name>/
+or
+	cpu_atom/<event name>/
+
+For example, count the 'cycles' event on core cpus.
+
+	perf stat -e cpu_core/cycles/
+
+Create two events for one hardware event automatically
+------------------------------------------------------
+
+When creating one event and the event is available on both atom and core,
+two events are created automatically. One is for atom, the other is for
+core. Most of hardware events and cache events are available on both
+cpu_core and cpu_atom.
+
+For hardware events, they have pre-defined configs (e.g. 0 for cycles).
+But on hybrid platform, kernel needs to know where the event comes from
+(from atom or from core). The original perf event type PERF_TYPE_HARDWARE
+can't carry pmu information. So now this type is extended to be PMU aware
+type. The PMU type ID is stored at attr.config[63:32].
+
+PMU type ID is retrieved from sysfs.
+/sys/devices/cpu_atom/type
+/sys/devices/cpu_core/type
+
+The new attr.config layout for PERF_TYPE_HARDWARE:
+
+PERF_TYPE_HARDWARE:                 0xEEEEEEEE000000AA
+                                    AA: hardware event ID
+                                    EEEEEEEE: PMU type ID
+
+Cache event is similar. The type PERF_TYPE_HW_CACHE is extended to be
+PMU aware type. The PMU type ID is stored at attr.config[63:32].
+
+The new attr.config layout for PERF_TYPE_HW_CACHE:
+
+PERF_TYPE_HW_CACHE:                 0xEEEEEEEE00DDCCBB
+                                    BB: hardware cache ID
+                                    CC: hardware cache op ID
+                                    DD: hardware cache op result ID
+                                    EEEEEEEE: PMU type ID
+
+When enabling a hardware event without specified pmu, such as,
+perf stat -e cycles -a (use system-wide in this example), two events
+are created automatically.
+
+  ------------------------------------------------------------
+  perf_event_attr:
+    size                             120
+    config                           0x400000000
+    sample_type                      IDENTIFIER
+    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+    disabled                         1
+    inherit                          1
+    exclude_guest                    1
+  ------------------------------------------------------------
+
+and
+
+  ------------------------------------------------------------
+  perf_event_attr:
+    size                             120
+    config                           0x800000000
+    sample_type                      IDENTIFIER
+    read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+    disabled                         1
+    inherit                          1
+    exclude_guest                    1
+  ------------------------------------------------------------
+
+type 0 is PERF_TYPE_HARDWARE.
+0x4 in 0x400000000 indicates it's cpu_core pmu.
+0x8 in 0x800000000 indicates it's cpu_atom pmu (atom pmu type id is random).
+
+The kernel creates 'cycles' (0x400000000) on cpu0-cpu15 (core cpus),
+and create 'cycles' (0x800000000) on cpu16-cpu23 (atom cpus).
+
+For perf-stat result, it displays two events:
+
+ Performance counter stats for 'system wide':
+
+           6,744,979      cpu_core/cycles/
+           1,965,552      cpu_atom/cycles/
+
+The first 'cycles' is core event, the second 'cycles' is atom event.
+
+Thread mode example:
+--------------------
+
+perf-stat reports the scaled counts for hybrid event and with a percentage
+displayed. The percentage is the event's running time/enabling time.
+
+One example, 'triad_loop' runs on cpu16 (atom core), while we can see the
+scaled value for core cycles is 160,444,092 and the percentage is 0.47%.
+
+perf stat -e cycles -- taskset -c 16 ./triad_loop
+
+As previous, two events are created.
+
+------------------------------------------------------------
+perf_event_attr:
+  size                             120
+  config                           0x400000000
+  sample_type                      IDENTIFIER
+  read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+  disabled                         1
+  inherit                          1
+  enable_on_exec                   1
+  exclude_guest                    1
+------------------------------------------------------------
+
+and
+
+------------------------------------------------------------
+perf_event_attr:
+  size                             120
+  config                           0x800000000
+  sample_type                      IDENTIFIER
+  read_format                      TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
+  disabled                         1
+  inherit                          1
+  enable_on_exec                   1
+  exclude_guest                    1
+------------------------------------------------------------
+
+ Performance counter stats for 'taskset -c 16 ./triad_loop':
+
+       233,066,666      cpu_core/cycles/                                              (0.43%)
+       604,097,080      cpu_atom/cycles/                                              (99.57%)
+
+perf-record:
+------------
+
+If there is no '-e' specified in perf record, on hybrid platform,
+it creates two default 'cycles' and adds them to event list. One
+is for core, the other is for atom.
+
+perf-stat:
+----------
+
+If there is no '-e' specified in perf stat, on hybrid platform,
+besides of software events, following events are created and
+added to event list in order.
+
+cpu_core/cycles/,
+cpu_atom/cycles/,
+cpu_core/instructions/,
+cpu_atom/instructions/,
+cpu_core/branches/,
+cpu_atom/branches/,
+cpu_core/branch-misses/,
+cpu_atom/branch-misses/
+
+Of course, both perf-stat and perf-record support to enable
+hybrid event with a specific pmu.
+
+e.g.
+perf stat -e cpu_core/cycles/
+perf stat -e cpu_atom/cycles/
+perf stat -e cpu_core/r1a/
+perf stat -e cpu_atom/L1-icache-loads/
+perf stat -e cpu_core/cycles/,cpu_atom/instructions/
+perf stat -e '{cpu_core/cycles/,cpu_core/instructions/}'
+
+But '{cpu_core/cycles/,cpu_atom/instructions/}' will return
+warning and disable grouping, because the pmus in group are
+not matched (cpu_core vs. cpu_atom).
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index 1b5042f134a8..80c1be5d566c 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -124,6 +124,13 @@ OPTIONS
 --group::
 	Show event group information together
 
+--demangle::
+	Demangle symbol names to human readable form. It's enabled by default,
+	disable with --no-demangle.
+
+--demangle-kernel::
+	Demangle kernel symbol names to human readable form (for C++ kernels).
+
 --percent-type::
 	Set annotation percent type from following choices:
 	  global-period, local-period, global-hits, local-hits
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index bb167e32a1d7..cd8ce6e8ec12 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -57,7 +57,7 @@ OPTIONS
 -u::
 --update=::
 	Update specified file of the cache. Note that this doesn't remove
-	older entires since those may be still needed for annotating old
+	older entries since those may be still needed for annotating old
 	(or remote) perf.data. Only if there is already a cache which has
 	exactly same build-id, that is replaced by new one. It can be used
 	to update kallsyms and kernel dso to vmlinux in order to support
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
index 153bde14bbe0..b0872c801866 100644
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -123,6 +123,7 @@ Given a $HOME/.perfconfig like this:
 		queue-size = 0
 		children = true
 		group = true
+		skip-empty = true
 
 	[llvm]
 		dump-obj = true
@@ -393,6 +394,12 @@ annotate.*::
 
 		This option works with tui, stdio2 browsers.
 
+	annotate.demangle::
+		Demangle symbol names to human readable form. Default is 'true'.
+
+	annotate.demangle_kernel::
+		Demangle kernel symbol names to human readable form. Default is 'true'.
+
 hist.*::
 	hist.percentage::
 		This option control the way to calculate overhead of filtered entries -
@@ -525,6 +532,10 @@ report.*::
 		     0.07%   0.00%  noploop  ld-2.15.so         [.] strcmp
 		     0.03%   0.00%  noploop  [kernel.kallsyms]  [k] timerqueue_del
 
+	report.skip-empty::
+		This option can change default stat behavior with empty results.
+		If it's set true, 'perf report --stat' will not show 0 stats.
+
 top.*::
 	top.children::
 		Same as 'report.children'. So if it is enabled, the output of 'top'
diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt
index 726b9bc9e1a7..417bf17e265c 100644
--- a/tools/perf/Documentation/perf-data.txt
+++ b/tools/perf/Documentation/perf-data.txt
@@ -17,7 +17,7 @@ Data file related processing.
 COMMANDS
 --------
 convert::
-	Converts perf data file into another format (only CTF [1] format is support by now).
+	Converts perf data file into another format.
 	It's possible to set data-convert debug variable to get debug messages from conversion,
 	like:
 	  perf --debug data-convert data convert ...
@@ -27,6 +27,9 @@ OPTIONS for 'convert'
 --to-ctf::
 	Triggers the CTF conversion, specify the path of CTF data directory.
 
+--to-json::
+	Triggers JSON conversion. Specify the JSON filename to output.
+
 --tod::
 	Convert time to wall clock time.
 
diff --git a/tools/perf/Documentation/perf-iostat.txt b/tools/perf/Documentation/perf-iostat.txt
new file mode 100644
index 000000000000..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===============
+
+NAME
+----
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+--------
+[verse]
+'perf iostat' list
+'perf iostat' <ports> -- <command> [<options>]
+
+DESCRIPTION
+-----------
+Mode is intended to provide four I/O performance metrics per each PCIe root port:
+
+- Inbound Read   - I/O devices below root port read from the host memory, in MB
+
+- Inbound Write  - I/O devices below root port write to the host memory, in MB
+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+list::
+	List all PCIe root ports.
+
+<ports>::
+	Select the root ports for monitoring. Comma-separated list is supported.
+
+EXAMPLES
+--------
+
+1. List all PCIe root ports (example for 2-S platform):
+
+   $ perf iostat list
+   S0-uncore_iio_0<0000:00>
+   S1-uncore_iio_0<0000:80>
+   S0-uncore_iio_1<0000:17>
+   S1-uncore_iio_1<0000:85>
+   S0-uncore_iio_2<0000:3a>
+   S1-uncore_iio_2<0000:ae>
+   S0-uncore_iio_3<0000:5d>
+   S1-uncore_iio_3<0000:d7>
+
+2. Collect metrics for all PCIe root ports:
+
+   $ perf iostat -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 215.974 s, 1.7 GB/s
+
+    Performance counter stats for 'system wide':
+
+      port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
+   0000:00                    1                    0                    2                    3
+   0000:80                    0                    0                    0                    0
+   0000:17               352552                   43                    0                   21
+   0000:85                    0                    0                    0                    0
+   0000:3a                    3                    0                    0                    0
+   0000:ae                    0                    0                    0                    0
+   0000:5d                    0                    0                    0                    0
+   0000:d7                    0                    0                    0                    0
+
+3. Collect metrics for comma-separated list of PCIe root ports:
+
+   $ perf iostat 0000:17,0:3a -- dd if=/dev/zero of=/dev/nvme0n1 bs=1M oflag=direct
+   357708+0 records in
+   357707+0 records out
+   375083606016 bytes (375 GB, 349 GiB) copied, 197.08 s, 1.9 GB/s
+
+    Performance counter stats for 'system wide':
+
+      port             Inbound Read(MB)    Inbound Write(MB)    Outbound Read(MB)   Outbound Write(MB)
+   0000:17               358559                   44                    0                   22
+   0000:3a                    3                    2                    0                    0
+
+        197.081983474 seconds time elapsed
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
+\ No newline at end of file
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index f3161c9673e9..d71bac847936 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -695,6 +695,7 @@ measurements:
  wait -n ${perf_pid}
  exit $?
 
+include::intel-hybrid.txt[]
 
 SEE ALSO
 --------
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index f546b5e9db05..24efc0583c93 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -112,6 +112,8 @@ OPTIONS
 	- ins_lat: Instruction latency in core cycles. This is the global instruction
 	  latency
 	- local_ins_lat: Local instruction latency version
+	- p_stage_cyc: On powerpc, this presents the number of cycles spent in a
+	  pipeline stage. And currently supported only on powerpc.
 
 	By default, comm, dso and symbol keys are used.
 	(i.e. --sort comm,dso,symbol)
@@ -224,6 +226,9 @@ OPTIONS
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
+--disable-order::
+	Disable raw trace ordering.
+
 -g::
 --call-graph=<print_type,threshold[,print_limit],order,sort_key[,branch],value>::
         Display call chains using type, min percent threshold, print limit,
@@ -472,7 +477,7 @@ OPTIONS
 	but probably we'll make the default not to show the switch-on/off events
         on the --group mode and if there is only one event besides the off/on ones,
 	go straight to the histogram browser, just like 'perf report' with no events
-	explicitely specified does.
+	explicitly specified does.
 
 --itrace::
 	Options for decoding instruction tracing data. The options are:
@@ -566,6 +571,9 @@ include::itrace.txt[]
 			    sampled cycles
 	'Avg Cycles'      - block average sampled cycles
 
+--skip-empty::
+	Do not print 0 results in the --stat output.
+
 include::callchain-overhead-calculation.txt[]
 
 SEE ALSO
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 08a1714494f8..45c2467e4eb2 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -93,6 +93,19 @@ report::
 
         1.102235068 seconds time elapsed
 
+--bpf-counters::
+	Use BPF programs to aggregate readings from perf_events.  This
+	allows multiple perf-stat sessions that are counting the same metric (cycles,
+	instructions, etc.) to share hardware counters.
+	To use BPF programs on common events by default, use
+	"perf config stat.bpf-counter-events=<list_of_events>".
+
+--bpf-attr-map::
+	With option "--bpf-counters", different perf-stat sessions share
+	information about shared BPF programs and maps via a pinned hashmap.
+	Use "--bpf-attr-map" to specify the path of this pinned hashmap.
+	The default path is /sys/fs/bpf/perf_attr_map.
+
 ifdef::HAVE_LIBPFM[]
 --pfm-events events::
 Select a PMU event using libpfm4 syntax (see http://perfmon2.sf.net)
@@ -142,7 +155,10 @@ Do not aggregate counts across all monitored CPUs.
 
 -n::
 --null::
-        null run - don't start any counters
+null run - Don't start any counters.
+
+This can be useful to measure just elapsed wall-clock time - or to assess the
+raw overhead of perf stat itself, without running any counters.
 
 -v::
 --verbose::
@@ -468,6 +484,15 @@ convenient for post processing.
 --summary::
 Print summary for interval mode (-I).
 
+--no-csv-summary::
+Don't print 'summary' at the first column for CVS summary output.
+This option must be used with -x and --summary.
+
+This option can be enabled in perf config by setting the variable
+'stat.no-csv-summary'.
+
+$ perf config stat.no-csv-summary=true
+
 EXAMPLES
 --------
 
@@ -527,6 +552,8 @@ The fields are in this order:
 
 Additional metrics may be printed with all earlier fields being empty.
 
+include::intel-hybrid.txt[]
+
 SEE ALSO
 --------
 linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index ee2024691d46..bba5ffb05463 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -317,7 +317,7 @@ Default is to monitor all CPUS.
 	but probably we'll make the default not to show the switch-on/off events
         on the --group mode and if there is only one event besides the off/on ones,
 	go straight to the histogram browser, just like 'perf top' with no events
-	explicitely specified does.
+	explicitly specified does.
 
 --stitch-lbr::
 	Show callgraph with stitched LBRs, which may have more complete
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
index c130a3c46a90..9c330cdfa973 100644
--- a/tools/perf/Documentation/perf.txt
+++ b/tools/perf/Documentation/perf.txt
@@ -76,3 +76,15 @@ SEE ALSO
 linkperf:perf-stat[1], linkperf:perf-top[1],
 linkperf:perf-record[1], linkperf:perf-report[1],
 linkperf:perf-list[1]
+
+linkperf:perf-annotate[1],linkperf:perf-archive[1],
+linkperf:perf-bench[1], linkperf:perf-buildid-cache[1],
+linkperf:perf-buildid-list[1], linkperf:perf-c2c[1],
+linkperf:perf-config[1], linkperf:perf-data[1], linkperf:perf-diff[1],
+linkperf:perf-evlist[1], linkperf:perf-ftrace[1],
+linkperf:perf-help[1], linkperf:perf-inject[1],
+linkperf:perf-intel-pt[1], linkperf:perf-kallsyms[1],
+linkperf:perf-kmem[1], linkperf:perf-kvm[1], linkperf:perf-lock[1],
+linkperf:perf-mem[1], linkperf:perf-probe[1], linkperf:perf-sched[1],
+linkperf:perf-script[1], linkperf:perf-test[1],
+linkperf:perf-trace[1], linkperf:perf-version[1]
diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt
index 10f07f9455b8..c6302df4cf29 100644
--- a/tools/perf/Documentation/topdown.txt
+++ b/tools/perf/Documentation/topdown.txt
@@ -72,6 +72,7 @@ For example, the perf_event_attr structure can be initialized with
 The Fixed counter 3 must be the leader of the group.
 
 #include <linux/perf_event.h>
+#include <sys/mman.h>
 #include <sys/syscall.h>
 #include <unistd.h>
 
@@ -95,6 +96,11 @@ int slots_fd = perf_event_open(&slots, 0, -1, -1, 0);
 if (slots_fd < 0)
 	... error ...
 
+/* Memory mapping the fd permits _rdpmc calls from userspace */
+void *slots_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, slots_fd, 0);
+if (!slot_p)
+	.... error ...
+
 /*
  * Open metrics event file descriptor for current task.
  * Set slots event as the leader of the group.
@@ -110,6 +116,14 @@ int metrics_fd = perf_event_open(&metrics, 0, -1, slots_fd, 0);
 if (metrics_fd < 0)
 	... error ...
 
+/* Memory mapping the fd permits _rdpmc calls from userspace */
+void *metrics_p = mmap(0, getpagesize(), PROT_READ, MAP_SHARED, metrics_fd, 0);
+if (!metrics_p)
+	... error ...
+
+Note: the file descriptors returned by the perf_event_open calls must be memory
+mapped to permit calls to the _rdpmd instruction. Permission may also be granted
+by writing the /sys/devices/cpu/rdpmc sysfs node.
 
 The RDPMC instruction (or _rdpmc compiler intrinsic) can now be used
 to read slots and the topdown metrics at different points of the program:
@@ -141,6 +155,10 @@ as the parallelism and overlap in the CPU program execution will
 cause too much measurement inaccuracy. For example instrumenting
 individual basic blocks is definitely too fine grained.
 
+_rdpmc calls should not be mixed with reading the metrics and slots counters
+through system calls, as the kernel will reset these counters after each system
+call.
+
 Decoding metrics values
 =======================
 
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index b8fc7d972be9..f3fe360a35c6 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -100,7 +100,10 @@ clean:
 # make -C tools/perf -f tests/make
 #
 build-test:
-	@$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg out
+	@$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg make_static make_with_gtk2 out
+
+build-test-tarball:
+	@$(MAKE) -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory out
 
 #
 # All other targets get passed through:
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index d8e59d31399a..0d6619064a83 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -32,7 +32,7 @@ ifneq ($(NO_SYSCALL_TABLE),1)
       NO_SYSCALL_TABLE := 0
     endif
   else
-    ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390))
+    ifeq ($(SRCARCH),$(filter $(SRCARCH),powerpc arm64 s390 mips))
       NO_SYSCALL_TABLE := 0
     endif
   endif
@@ -87,6 +87,13 @@ ifeq ($(ARCH),s390)
   CFLAGS += -fPIC -I$(OUTPUT)arch/s390/include/generated
 endif
 
+ifeq ($(ARCH),mips)
+  NO_PERF_REGS := 0
+  CFLAGS += -I$(OUTPUT)arch/mips/include/generated
+  CFLAGS += -I../../arch/mips/include/uapi -I../../arch/mips/include/generated/uapi
+  LIBUNWIND_LIBS = -lunwind -lunwind-mips
+endif
+
 ifeq ($(NO_PERF_REGS),0)
   $(call detected,CONFIG_PERF_REGS)
 endif
@@ -292,6 +299,9 @@ ifneq ($(TCMALLOC),)
 endif
 
 ifeq ($(FEATURES_DUMP),)
+# We will display at the end of this Makefile.config, using $(call feature_display_entries)
+# As we may retry some feature detection here, see the disassembler-four-args case, for instance
+  FEATURE_DISPLAY_DEFERRED := 1
 include $(srctree)/tools/build/Makefile.feature
 else
 include $(FEATURES_DUMP)
@@ -1072,6 +1082,15 @@ ifdef LIBPFM4
   endif
 endif
 
+ifdef LIBTRACEEVENT_DYNAMIC
+  $(call feature_check,libtraceevent)
+  ifeq ($(feature-libtraceevent), 1)
+    EXTLIBS += -ltraceevent
+  else
+    dummy := $(error Error: No libtraceevent devel library found, please install libtraceevent-devel);
+  endif
+endif
+
 # Among the variables below, these:
 #   perfexecdir
 #   perf_include_dir
@@ -1208,3 +1227,13 @@ $(call detected_var,LIBDIR)
 $(call detected_var,GTK_CFLAGS)
 $(call detected_var,PERL_EMBED_CCOPTS)
 $(call detected_var,PYTHON_EMBED_CCOPTS)
+
+# re-generate FEATURE-DUMP as we may have called feature_check, found out
+# extra libraries to add to LDFLAGS of some other test and then redo those
+# tests, see the block about libbfd, disassembler-four-args, for instance.
+$(shell rm -f $(FEATURE_DUMP_FILENAME))
+$(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME)))
+
+ifeq ($(feature_display),1)
+  $(call feature_display_entries)
+endif
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index f6e609673de2..e47f04e5b51e 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -128,6 +128,8 @@ include ../scripts/utilities.mak
 #
 # Define BUILD_BPF_SKEL to enable BPF skeletons
 #
+# Define LIBTRACEEVENT_DYNAMIC to enable libtraceevent dynamic linking
+#
 
 # As per kernel Makefile, avoid funny character set dependencies
 unexport LC_ALL
@@ -283,6 +285,7 @@ SCRIPT_SH =
 
 SCRIPT_SH += perf-archive.sh
 SCRIPT_SH += perf-with-kcore.sh
+SCRIPT_SH += perf-iostat.sh
 
 grep-libs = $(filter -l%,$(1))
 strip-libs = $(filter-out -l%,$(1))
@@ -309,7 +312,6 @@ endif
 
 LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
 export LIBTRACEEVENT
-
 LIBTRACEEVENT_DYNAMIC_LIST = $(PLUGINS_PATH)libtraceevent-dynamic-list
 
 #
@@ -374,12 +376,15 @@ endif
 
 export PERL_PATH
 
-PERFLIBS = $(LIBAPI) $(LIBTRACEEVENT) $(LIBSUBCMD) $(LIBPERF)
+PERFLIBS = $(LIBAPI) $(LIBSUBCMD) $(LIBPERF)
 ifndef NO_LIBBPF
   ifndef LIBBPF_DYNAMIC
     PERFLIBS += $(LIBBPF)
   endif
 endif
+ifndef LIBTRACEEVENT_DYNAMIC
+  PERFLIBS += $(LIBTRACEEVENT)
+endif
 
 # We choose to avoid "if .. else if .. else .. endif endif"
 # because maintaining the nesting to match is a pain.  If
@@ -948,6 +953,8 @@ endif
 		$(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(call QUIET_INSTALL, perf-with-kcore) \
 		$(INSTALL) $(OUTPUT)perf-with-kcore -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(call QUIET_INSTALL, perf-iostat) \
+		$(INSTALL) $(OUTPUT)perf-iostat -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 ifndef NO_LIBAUDIT
 	$(call QUIET_INSTALL, strace/groups) \
 		$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(STRACE_GROUPS_INSTDIR_SQ)'; \
@@ -1007,6 +1014,7 @@ python-clean:
 SKEL_OUT := $(abspath $(OUTPUT)util/bpf_skel)
 SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
+SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
 
 ifdef BUILD_BPF_SKEL
 BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
@@ -1021,7 +1029,7 @@ $(BPFTOOL): | $(SKEL_TMP_OUT)
 		OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
 
 $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
-	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf $(BPF_INCLUDE) \
+	$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \
 	  -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
 
 $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL)
@@ -1041,7 +1049,7 @@ bpf-skel-clean:
 	$(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS)
 
 clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean
-	$(call QUIET_CLEAN, core-objs)  $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS)
+	$(call QUIET_CLEAN, core-objs)  $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(OUTPUT)perf-iostat $(LANG_BINDINGS)
 	$(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
 	$(Q)$(RM) $(OUTPUT).config-detected
 	$(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index c25c878fd06c..d942f118d32c 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -67,6 +67,7 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
 	char path[PATH_MAX];
 	int err = -EINVAL;
 	u32 val;
+	u64 contextid;
 
 	ptr = container_of(itr, struct cs_etm_recording, itr);
 	cs_etm_pmu = ptr->cs_etm_pmu;
@@ -86,25 +87,59 @@ static int cs_etm_set_context_id(struct auxtrace_record *itr,
 		goto out;
 	}
 
+	/* User has configured for PID tracing, respects it. */
+	contextid = evsel->core.attr.config &
+			(BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_CTXTID2));
+
 	/*
-	 * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID tracing
-	 * is supported:
-	 *  0b00000 Context ID tracing is not supported.
-	 *  0b00100 Maximum of 32-bit Context ID size.
-	 *  All other values are reserved.
+	 * If user doesn't configure the contextid format, parse PMU format and
+	 * enable PID tracing according to the "contextid" format bits:
+	 *
+	 *   If bit ETM_OPT_CTXTID is set, trace CONTEXTIDR_EL1;
+	 *   If bit ETM_OPT_CTXTID2 is set, trace CONTEXTIDR_EL2.
 	 */
-	val = BMVAL(val, 5, 9);
-	if (!val || val != 0x4) {
-		err = -EINVAL;
-		goto out;
+	if (!contextid)
+		contextid = perf_pmu__format_bits(&cs_etm_pmu->format,
+						  "contextid");
+
+	if (contextid & BIT(ETM_OPT_CTXTID)) {
+		/*
+		 * TRCIDR2.CIDSIZE, bit [9-5], indicates whether contextID
+		 * tracing is supported:
+		 *  0b00000 Context ID tracing is not supported.
+		 *  0b00100 Maximum of 32-bit Context ID size.
+		 *  All other values are reserved.
+		 */
+		val = BMVAL(val, 5, 9);
+		if (!val || val != 0x4) {
+			pr_err("%s: CONTEXTIDR_EL1 isn't supported\n",
+			       CORESIGHT_ETM_PMU_NAME);
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (contextid & BIT(ETM_OPT_CTXTID2)) {
+		/*
+		 * TRCIDR2.VMIDOPT[30:29] != 0 and
+		 * TRCIDR2.VMIDSIZE[14:10] == 0b00100 (32bit virtual contextid)
+		 * We can't support CONTEXTIDR in VMID if the size of the
+		 * virtual context id is < 32bit.
+		 * Any value of VMIDSIZE >= 4 (i.e, > 32bit) is fine for us.
+		 */
+		if (!BMVAL(val, 29, 30) || BMVAL(val, 10, 14) < 4) {
+			pr_err("%s: CONTEXTIDR_EL2 isn't supported\n",
+			       CORESIGHT_ETM_PMU_NAME);
+			err = -EINVAL;
+			goto out;
+		}
 	}
 
 	/* All good, let the kernel know */
-	evsel->core.attr.config |= (1 << ETM_OPT_CTXTID);
+	evsel->core.attr.config |= contextid;
 	err = 0;
 
 out:
-
 	return err;
 }
 
@@ -173,17 +208,17 @@ static int cs_etm_set_option(struct auxtrace_record *itr,
 		    !cpu_map__has(online_cpus, i))
 			continue;
 
-		if (option & ETM_SET_OPT_CTXTID) {
+		if (option & BIT(ETM_OPT_CTXTID)) {
 			err = cs_etm_set_context_id(itr, evsel, i);
 			if (err)
 				goto out;
 		}
-		if (option & ETM_SET_OPT_TS) {
+		if (option & BIT(ETM_OPT_TS)) {
 			err = cs_etm_set_timestamp(itr, evsel, i);
 			if (err)
 				goto out;
 		}
-		if (option & ~(ETM_SET_OPT_MASK))
+		if (option & ~(BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS)))
 			/* Nothing else is currently supported */
 			goto out;
 	}
@@ -343,7 +378,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 			opts->auxtrace_mmap_pages = roundup_pow_of_two(sz);
 		}
 
-		/* Snapshost size can't be bigger than the auxtrace area */
+		/* Snapshot size can't be bigger than the auxtrace area */
 		if (opts->auxtrace_snapshot_size >
 				opts->auxtrace_mmap_pages * (size_t)page_size) {
 			pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n",
@@ -410,7 +445,7 @@ static int cs_etm_recording_options(struct auxtrace_record *itr,
 		evsel__set_sample_bit(cs_etm_evsel, CPU);
 
 		err = cs_etm_set_option(itr, cs_etm_evsel,
-					ETM_SET_OPT_CTXTID | ETM_SET_OPT_TS);
+					BIT(ETM_OPT_CTXTID) | BIT(ETM_OPT_TS));
 		if (err)
 			goto out;
 	}
@@ -489,7 +524,9 @@ static u64 cs_etmv4_get_config(struct auxtrace_record *itr)
 		config |= BIT(ETM4_CFG_BIT_TS);
 	if (config_opts & BIT(ETM_OPT_RETSTK))
 		config |= BIT(ETM4_CFG_BIT_RETSTK);
-
+	if (config_opts & BIT(ETM_OPT_CTXTID2))
+		config |= BIT(ETM4_CFG_BIT_VMID) |
+			  BIT(ETM4_CFG_BIT_VMID_OPT);
 	return config;
 }
 
@@ -576,7 +613,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 				struct auxtrace_record *itr,
 				struct perf_record_auxtrace_info *info)
 {
-	u32 increment;
+	u32 increment, nr_trc_params;
 	u64 magic;
 	struct cs_etm_recording *ptr =
 			container_of(itr, struct cs_etm_recording, itr);
@@ -611,6 +648,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 
 		/* How much space was used */
 		increment = CS_ETMV4_PRIV_MAX;
+		nr_trc_params = CS_ETMV4_PRIV_MAX - CS_ETMV4_TRCCONFIGR;
 	} else {
 		magic = __perf_cs_etmv3_magic;
 		/* Get configuration register */
@@ -628,11 +666,13 @@ static void cs_etm_get_metadata(int cpu, u32 *offset,
 
 		/* How much space was used */
 		increment = CS_ETM_PRIV_MAX;
+		nr_trc_params = CS_ETM_PRIV_MAX - CS_ETM_ETMCR;
 	}
 
 	/* Build generic header portion */
 	info->priv[*offset + CS_ETM_MAGIC] = magic;
 	info->priv[*offset + CS_ETM_CPU] = cpu;
+	info->priv[*offset + CS_ETM_NR_TRC_PARAMS] = nr_trc_params;
 	/* Where the next CPU entry should start from */
 	*offset += increment;
 }
@@ -678,7 +718,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 
 	/* First fill out the session header */
 	info->type = PERF_AUXTRACE_CS_ETM;
-	info->priv[CS_HEADER_VERSION_0] = 0;
+	info->priv[CS_HEADER_VERSION] = CS_HEADER_CURRENT_VERSION;
 	info->priv[CS_PMU_TYPE_CPUS] = type << 32;
 	info->priv[CS_PMU_TYPE_CPUS] |= nr_cpu;
 	info->priv[CS_ETM_SNAPSHOT] = ptr->snapshot_mode;
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index ead2f2275eee..9fcb4e68add9 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -2,6 +2,7 @@ perf-y += header.o
 perf-y += machine.o
 perf-y += perf_regs.o
 perf-y += tsc.o
+perf-y += pmu.o
 perf-y += kvm-stat.o
 perf-$(CONFIG_DWARF)     += dwarf-regs.o
 perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c
index 50376b9062c1..2303256b7d05 100644
--- a/tools/perf/arch/arm64/util/kvm-stat.c
+++ b/tools/perf/arch/arm64/util/kvm-stat.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
 #include <memory.h>
-#include "../../util/evsel.h"
-#include "../../util/kvm-stat.h"
+#include "../../../util/evsel.h"
+#include "../../../util/kvm-stat.h"
 #include "arm64_exception_types.h"
 #include "debug.h"
 
diff --git a/tools/perf/arch/arm64/util/machine.c b/tools/perf/arch/arm64/util/machine.c
index 40c5e0b5bda8..7e7714290a87 100644
--- a/tools/perf/arch/arm64/util/machine.c
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -6,11 +6,11 @@
 #include "debug.h"
 #include "symbol.h"
 
-/* On arm64, kernel text segment start at high memory address,
+/* On arm64, kernel text segment starts at high memory address,
  * for example 0xffff 0000 8xxx xxxx. Modules start at a low memory
- * address, like 0xffff 0000 00ax xxxx. When only samll amount of
+ * address, like 0xffff 0000 00ax xxxx. When only small amount of
  * memory is used by modules, gap between end of module's text segment
- * and start of kernel text segment may be reach 2G.
+ * and start of kernel text segment may reach 2G.
  * Therefore do not fill this gap and do not assign it to the kernel dso map.
  */
 
diff --git a/tools/perf/arch/arm64/util/perf_regs.c b/tools/perf/arch/arm64/util/perf_regs.c
index 2518cde18b34..476b037eea1c 100644
--- a/tools/perf/arch/arm64/util/perf_regs.c
+++ b/tools/perf/arch/arm64/util/perf_regs.c
@@ -108,7 +108,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 		/* [sp], [sp, NUM] or [sp,NUM] */
 		new_len = 7;	/* + ( % s p ) NULL */
 
-		/* If the arugment is [sp], need to fill offset '0' */
+		/* If the argument is [sp], need to fill offset '0' */
 		if (rm[2].rm_so == -1)
 			new_len += 1;
 		else
diff --git a/tools/perf/arch/arm64/util/pmu.c b/tools/perf/arch/arm64/util/pmu.c
new file mode 100644
index 000000000000..2234fbd0a912
--- /dev/null
+++ b/tools/perf/arch/arm64/util/pmu.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../../../util/cpumap.h"
+#include "../../../util/pmu.h"
+
+struct pmu_events_map *pmu_events_map__find(void)
+{
+	struct perf_pmu *pmu = NULL;
+
+	while ((pmu = perf_pmu__scan(pmu))) {
+		if (!is_pmu_core(pmu->name))
+			continue;
+
+		/*
+		 * The cpumap should cover all CPUs. Otherwise, some CPUs may
+		 * not support some events or have different event IDs.
+		 */
+		if (pmu->cpus->nr != cpu__max_cpu())
+			return NULL;
+
+		return perf_pmu__find_map(pmu);
+	}
+
+	return NULL;
+}
diff --git a/tools/perf/arch/arm64/util/unwind-libunwind.c b/tools/perf/arch/arm64/util/unwind-libunwind.c
index 1495a9523a23..5aecf88e3de6 100644
--- a/tools/perf/arch/arm64/util/unwind-libunwind.c
+++ b/tools/perf/arch/arm64/util/unwind-libunwind.c
@@ -4,9 +4,9 @@
 #ifndef REMOTE_UNWIND_LIBUNWIND
 #include <libunwind.h>
 #include "perf_regs.h"
-#include "../../util/unwind.h"
+#include "../../../util/unwind.h"
 #endif
-#include "../../util/debug.h"
+#include "../../../util/debug.h"
 
 int LIBUNWIND__ARCH_REG_ID(int regnum)
 {
diff --git a/tools/perf/arch/mips/Makefile b/tools/perf/arch/mips/Makefile
new file mode 100644
index 000000000000..8bc09072e3d6
--- /dev/null
+++ b/tools/perf/arch/mips/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+endif
+
+# Syscall table generation for perf
+out    := $(OUTPUT)arch/mips/include/generated/asm
+header := $(out)/syscalls_n64.c
+sysprf := $(srctree)/tools/perf/arch/mips/entry/syscalls
+sysdef := $(sysprf)/syscall_n64.tbl
+systbl := $(sysprf)/mksyscalltbl
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+$(header): $(sysdef) $(systbl)
+	$(Q)$(SHELL) '$(systbl)' $(sysdef) > $@
+
+clean::
+	$(call QUIET_CLEAN, mips) $(RM) $(header)
+
+archheaders: $(header)
diff --git a/tools/perf/arch/mips/entry/syscalls/mksyscalltbl b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl
new file mode 100644
index 000000000000..fb1f49451af6
--- /dev/null
+++ b/tools/perf/arch/mips/entry/syscalls/mksyscalltbl
@@ -0,0 +1,32 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate system call table for perf. Derived from
+# s390 script.
+#
+# Author(s):  Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+# Changed by: Tiezhu Yang <yangtiezhu@loongson.cn>
+
+SYSCALL_TBL=$1
+
+if ! test -r $SYSCALL_TBL; then
+	echo "Could not read input file" >&2
+	exit 1
+fi
+
+create_table()
+{
+	local max_nr nr abi sc discard
+
+	echo 'static const char *syscalltbl_mips_n64[] = {'
+	while read nr abi sc discard; do
+		printf '\t[%d] = "%s",\n' $nr $sc
+		max_nr=$nr
+	done
+	echo '};'
+	echo "#define SYSCALLTBL_MIPS_N64_MAX_ID $max_nr"
+}
+
+grep -E "^[[:digit:]]+[[:space:]]+(n64)" $SYSCALL_TBL	\
+	|sort -k1 -n					\
+	|create_table
diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
new file mode 100644
index 000000000000..91649690b52f
--- /dev/null
+++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl
@@ -0,0 +1,358 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# system call numbers and entry vectors for mips
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The <abi> is always "n64" for this file.
+#
+0	n64	read				sys_read
+1	n64	write				sys_write
+2	n64	open				sys_open
+3	n64	close				sys_close
+4	n64	stat				sys_newstat
+5	n64	fstat				sys_newfstat
+6	n64	lstat				sys_newlstat
+7	n64	poll				sys_poll
+8	n64	lseek				sys_lseek
+9	n64	mmap				sys_mips_mmap
+10	n64	mprotect			sys_mprotect
+11	n64	munmap				sys_munmap
+12	n64	brk				sys_brk
+13	n64	rt_sigaction			sys_rt_sigaction
+14	n64	rt_sigprocmask			sys_rt_sigprocmask
+15	n64	ioctl				sys_ioctl
+16	n64	pread64				sys_pread64
+17	n64	pwrite64			sys_pwrite64
+18	n64	readv				sys_readv
+19	n64	writev				sys_writev
+20	n64	access				sys_access
+21	n64	pipe				sysm_pipe
+22	n64	_newselect			sys_select
+23	n64	sched_yield			sys_sched_yield
+24	n64	mremap				sys_mremap
+25	n64	msync				sys_msync
+26	n64	mincore				sys_mincore
+27	n64	madvise				sys_madvise
+28	n64	shmget				sys_shmget
+29	n64	shmat				sys_shmat
+30	n64	shmctl				sys_old_shmctl
+31	n64	dup				sys_dup
+32	n64	dup2				sys_dup2
+33	n64	pause				sys_pause
+34	n64	nanosleep			sys_nanosleep
+35	n64	getitimer			sys_getitimer
+36	n64	setitimer			sys_setitimer
+37	n64	alarm				sys_alarm
+38	n64	getpid				sys_getpid
+39	n64	sendfile			sys_sendfile64
+40	n64	socket				sys_socket
+41	n64	connect				sys_connect
+42	n64	accept				sys_accept
+43	n64	sendto				sys_sendto
+44	n64	recvfrom			sys_recvfrom
+45	n64	sendmsg				sys_sendmsg
+46	n64	recvmsg				sys_recvmsg
+47	n64	shutdown			sys_shutdown
+48	n64	bind				sys_bind
+49	n64	listen				sys_listen
+50	n64	getsockname			sys_getsockname
+51	n64	getpeername			sys_getpeername
+52	n64	socketpair			sys_socketpair
+53	n64	setsockopt			sys_setsockopt
+54	n64	getsockopt			sys_getsockopt
+55	n64	clone				__sys_clone
+56	n64	fork				__sys_fork
+57	n64	execve				sys_execve
+58	n64	exit				sys_exit
+59	n64	wait4				sys_wait4
+60	n64	kill				sys_kill
+61	n64	uname				sys_newuname
+62	n64	semget				sys_semget
+63	n64	semop				sys_semop
+64	n64	semctl				sys_old_semctl
+65	n64	shmdt				sys_shmdt
+66	n64	msgget				sys_msgget
+67	n64	msgsnd				sys_msgsnd
+68	n64	msgrcv				sys_msgrcv
+69	n64	msgctl				sys_old_msgctl
+70	n64	fcntl				sys_fcntl
+71	n64	flock				sys_flock
+72	n64	fsync				sys_fsync
+73	n64	fdatasync			sys_fdatasync
+74	n64	truncate			sys_truncate
+75	n64	ftruncate			sys_ftruncate
+76	n64	getdents			sys_getdents
+77	n64	getcwd				sys_getcwd
+78	n64	chdir				sys_chdir
+79	n64	fchdir				sys_fchdir
+80	n64	rename				sys_rename
+81	n64	mkdir				sys_mkdir
+82	n64	rmdir				sys_rmdir
+83	n64	creat				sys_creat
+84	n64	link				sys_link
+85	n64	unlink				sys_unlink
+86	n64	symlink				sys_symlink
+87	n64	readlink			sys_readlink
+88	n64	chmod				sys_chmod
+89	n64	fchmod				sys_fchmod
+90	n64	chown				sys_chown
+91	n64	fchown				sys_fchown
+92	n64	lchown				sys_lchown
+93	n64	umask				sys_umask
+94	n64	gettimeofday			sys_gettimeofday
+95	n64	getrlimit			sys_getrlimit
+96	n64	getrusage			sys_getrusage
+97	n64	sysinfo				sys_sysinfo
+98	n64	times				sys_times
+99	n64	ptrace				sys_ptrace
+100	n64	getuid				sys_getuid
+101	n64	syslog				sys_syslog
+102	n64	getgid				sys_getgid
+103	n64	setuid				sys_setuid
+104	n64	setgid				sys_setgid
+105	n64	geteuid				sys_geteuid
+106	n64	getegid				sys_getegid
+107	n64	setpgid				sys_setpgid
+108	n64	getppid				sys_getppid
+109	n64	getpgrp				sys_getpgrp
+110	n64	setsid				sys_setsid
+111	n64	setreuid			sys_setreuid
+112	n64	setregid			sys_setregid
+113	n64	getgroups			sys_getgroups
+114	n64	setgroups			sys_setgroups
+115	n64	setresuid			sys_setresuid
+116	n64	getresuid			sys_getresuid
+117	n64	setresgid			sys_setresgid
+118	n64	getresgid			sys_getresgid
+119	n64	getpgid				sys_getpgid
+120	n64	setfsuid			sys_setfsuid
+121	n64	setfsgid			sys_setfsgid
+122	n64	getsid				sys_getsid
+123	n64	capget				sys_capget
+124	n64	capset				sys_capset
+125	n64	rt_sigpending			sys_rt_sigpending
+126	n64	rt_sigtimedwait			sys_rt_sigtimedwait
+127	n64	rt_sigqueueinfo			sys_rt_sigqueueinfo
+128	n64	rt_sigsuspend			sys_rt_sigsuspend
+129	n64	sigaltstack			sys_sigaltstack
+130	n64	utime				sys_utime
+131	n64	mknod				sys_mknod
+132	n64	personality			sys_personality
+133	n64	ustat				sys_ustat
+134	n64	statfs				sys_statfs
+135	n64	fstatfs				sys_fstatfs
+136	n64	sysfs				sys_sysfs
+137	n64	getpriority			sys_getpriority
+138	n64	setpriority			sys_setpriority
+139	n64	sched_setparam			sys_sched_setparam
+140	n64	sched_getparam			sys_sched_getparam
+141	n64	sched_setscheduler		sys_sched_setscheduler
+142	n64	sched_getscheduler		sys_sched_getscheduler
+143	n64	sched_get_priority_max		sys_sched_get_priority_max
+144	n64	sched_get_priority_min		sys_sched_get_priority_min
+145	n64	sched_rr_get_interval		sys_sched_rr_get_interval
+146	n64	mlock				sys_mlock
+147	n64	munlock				sys_munlock
+148	n64	mlockall			sys_mlockall
+149	n64	munlockall			sys_munlockall
+150	n64	vhangup				sys_vhangup
+151	n64	pivot_root			sys_pivot_root
+152	n64	_sysctl				sys_ni_syscall
+153	n64	prctl				sys_prctl
+154	n64	adjtimex			sys_adjtimex
+155	n64	setrlimit			sys_setrlimit
+156	n64	chroot				sys_chroot
+157	n64	sync				sys_sync
+158	n64	acct				sys_acct
+159	n64	settimeofday			sys_settimeofday
+160	n64	mount				sys_mount
+161	n64	umount2				sys_umount
+162	n64	swapon				sys_swapon
+163	n64	swapoff				sys_swapoff
+164	n64	reboot				sys_reboot
+165	n64	sethostname			sys_sethostname
+166	n64	setdomainname			sys_setdomainname
+167	n64	create_module			sys_ni_syscall
+168	n64	init_module			sys_init_module
+169	n64	delete_module			sys_delete_module
+170	n64	get_kernel_syms			sys_ni_syscall
+171	n64	query_module			sys_ni_syscall
+172	n64	quotactl			sys_quotactl
+173	n64	nfsservctl			sys_ni_syscall
+174	n64	getpmsg				sys_ni_syscall
+175	n64	putpmsg				sys_ni_syscall
+176	n64	afs_syscall			sys_ni_syscall
+# 177 reserved for security
+177	n64	reserved177			sys_ni_syscall
+178	n64	gettid				sys_gettid
+179	n64	readahead			sys_readahead
+180	n64	setxattr			sys_setxattr
+181	n64	lsetxattr			sys_lsetxattr
+182	n64	fsetxattr			sys_fsetxattr
+183	n64	getxattr			sys_getxattr
+184	n64	lgetxattr			sys_lgetxattr
+185	n64	fgetxattr			sys_fgetxattr
+186	n64	listxattr			sys_listxattr
+187	n64	llistxattr			sys_llistxattr
+188	n64	flistxattr			sys_flistxattr
+189	n64	removexattr			sys_removexattr
+190	n64	lremovexattr			sys_lremovexattr
+191	n64	fremovexattr			sys_fremovexattr
+192	n64	tkill				sys_tkill
+193	n64	reserved193			sys_ni_syscall
+194	n64	futex				sys_futex
+195	n64	sched_setaffinity		sys_sched_setaffinity
+196	n64	sched_getaffinity		sys_sched_getaffinity
+197	n64	cacheflush			sys_cacheflush
+198	n64	cachectl			sys_cachectl
+199	n64	sysmips				__sys_sysmips
+200	n64	io_setup			sys_io_setup
+201	n64	io_destroy			sys_io_destroy
+202	n64	io_getevents			sys_io_getevents
+203	n64	io_submit			sys_io_submit
+204	n64	io_cancel			sys_io_cancel
+205	n64	exit_group			sys_exit_group
+206	n64	lookup_dcookie			sys_lookup_dcookie
+207	n64	epoll_create			sys_epoll_create
+208	n64	epoll_ctl			sys_epoll_ctl
+209	n64	epoll_wait			sys_epoll_wait
+210	n64	remap_file_pages		sys_remap_file_pages
+211	n64	rt_sigreturn			sys_rt_sigreturn
+212	n64	set_tid_address			sys_set_tid_address
+213	n64	restart_syscall			sys_restart_syscall
+214	n64	semtimedop			sys_semtimedop
+215	n64	fadvise64			sys_fadvise64_64
+216	n64	timer_create			sys_timer_create
+217	n64	timer_settime			sys_timer_settime
+218	n64	timer_gettime			sys_timer_gettime
+219	n64	timer_getoverrun		sys_timer_getoverrun
+220	n64	timer_delete			sys_timer_delete
+221	n64	clock_settime			sys_clock_settime
+222	n64	clock_gettime			sys_clock_gettime
+223	n64	clock_getres			sys_clock_getres
+224	n64	clock_nanosleep			sys_clock_nanosleep
+225	n64	tgkill				sys_tgkill
+226	n64	utimes				sys_utimes
+227	n64	mbind				sys_mbind
+228	n64	get_mempolicy			sys_get_mempolicy
+229	n64	set_mempolicy			sys_set_mempolicy
+230	n64	mq_open				sys_mq_open
+231	n64	mq_unlink			sys_mq_unlink
+232	n64	mq_timedsend			sys_mq_timedsend
+233	n64	mq_timedreceive			sys_mq_timedreceive
+234	n64	mq_notify			sys_mq_notify
+235	n64	mq_getsetattr			sys_mq_getsetattr
+236	n64	vserver				sys_ni_syscall
+237	n64	waitid				sys_waitid
+# 238 was sys_setaltroot
+239	n64	add_key				sys_add_key
+240	n64	request_key			sys_request_key
+241	n64	keyctl				sys_keyctl
+242	n64	set_thread_area			sys_set_thread_area
+243	n64	inotify_init			sys_inotify_init
+244	n64	inotify_add_watch		sys_inotify_add_watch
+245	n64	inotify_rm_watch		sys_inotify_rm_watch
+246	n64	migrate_pages			sys_migrate_pages
+247	n64	openat				sys_openat
+248	n64	mkdirat				sys_mkdirat
+249	n64	mknodat				sys_mknodat
+250	n64	fchownat			sys_fchownat
+251	n64	futimesat			sys_futimesat
+252	n64	newfstatat			sys_newfstatat
+253	n64	unlinkat			sys_unlinkat
+254	n64	renameat			sys_renameat
+255	n64	linkat				sys_linkat
+256	n64	symlinkat			sys_symlinkat
+257	n64	readlinkat			sys_readlinkat
+258	n64	fchmodat			sys_fchmodat
+259	n64	faccessat			sys_faccessat
+260	n64	pselect6			sys_pselect6
+261	n64	ppoll				sys_ppoll
+262	n64	unshare				sys_unshare
+263	n64	splice				sys_splice
+264	n64	sync_file_range			sys_sync_file_range
+265	n64	tee				sys_tee
+266	n64	vmsplice			sys_vmsplice
+267	n64	move_pages			sys_move_pages
+268	n64	set_robust_list			sys_set_robust_list
+269	n64	get_robust_list			sys_get_robust_list
+270	n64	kexec_load			sys_kexec_load
+271	n64	getcpu				sys_getcpu
+272	n64	epoll_pwait			sys_epoll_pwait
+273	n64	ioprio_set			sys_ioprio_set
+274	n64	ioprio_get			sys_ioprio_get
+275	n64	utimensat			sys_utimensat
+276	n64	signalfd			sys_signalfd
+277	n64	timerfd				sys_ni_syscall
+278	n64	eventfd				sys_eventfd
+279	n64	fallocate			sys_fallocate
+280	n64	timerfd_create			sys_timerfd_create
+281	n64	timerfd_gettime			sys_timerfd_gettime
+282	n64	timerfd_settime			sys_timerfd_settime
+283	n64	signalfd4			sys_signalfd4
+284	n64	eventfd2			sys_eventfd2
+285	n64	epoll_create1			sys_epoll_create1
+286	n64	dup3				sys_dup3
+287	n64	pipe2				sys_pipe2
+288	n64	inotify_init1			sys_inotify_init1
+289	n64	preadv				sys_preadv
+290	n64	pwritev				sys_pwritev
+291	n64	rt_tgsigqueueinfo		sys_rt_tgsigqueueinfo
+292	n64	perf_event_open			sys_perf_event_open
+293	n64	accept4				sys_accept4
+294	n64	recvmmsg			sys_recvmmsg
+295	n64	fanotify_init			sys_fanotify_init
+296	n64	fanotify_mark			sys_fanotify_mark
+297	n64	prlimit64			sys_prlimit64
+298	n64	name_to_handle_at		sys_name_to_handle_at
+299	n64	open_by_handle_at		sys_open_by_handle_at
+300	n64	clock_adjtime			sys_clock_adjtime
+301	n64	syncfs				sys_syncfs
+302	n64	sendmmsg			sys_sendmmsg
+303	n64	setns				sys_setns
+304	n64	process_vm_readv		sys_process_vm_readv
+305	n64	process_vm_writev		sys_process_vm_writev
+306	n64	kcmp				sys_kcmp
+307	n64	finit_module			sys_finit_module
+308	n64	getdents64			sys_getdents64
+309	n64	sched_setattr			sys_sched_setattr
+310	n64	sched_getattr			sys_sched_getattr
+311	n64	renameat2			sys_renameat2
+312	n64	seccomp				sys_seccomp
+313	n64	getrandom			sys_getrandom
+314	n64	memfd_create			sys_memfd_create
+315	n64	bpf				sys_bpf
+316	n64	execveat			sys_execveat
+317	n64	userfaultfd			sys_userfaultfd
+318	n64	membarrier			sys_membarrier
+319	n64	mlock2				sys_mlock2
+320	n64	copy_file_range			sys_copy_file_range
+321	n64	preadv2				sys_preadv2
+322	n64	pwritev2			sys_pwritev2
+323	n64	pkey_mprotect			sys_pkey_mprotect
+324	n64	pkey_alloc			sys_pkey_alloc
+325	n64	pkey_free			sys_pkey_free
+326	n64	statx				sys_statx
+327	n64	rseq				sys_rseq
+328	n64	io_pgetevents			sys_io_pgetevents
+# 329 through 423 are reserved to sync up with other architectures
+424	n64	pidfd_send_signal		sys_pidfd_send_signal
+425	n64	io_uring_setup			sys_io_uring_setup
+426	n64	io_uring_enter			sys_io_uring_enter
+427	n64	io_uring_register		sys_io_uring_register
+428	n64	open_tree			sys_open_tree
+429	n64	move_mount			sys_move_mount
+430	n64	fsopen				sys_fsopen
+431	n64	fsconfig			sys_fsconfig
+432	n64	fsmount				sys_fsmount
+433	n64	fspick				sys_fspick
+434	n64	pidfd_open			sys_pidfd_open
+435	n64	clone3				__sys_clone3
+436	n64	close_range			sys_close_range
+437	n64	openat2				sys_openat2
+438	n64	pidfd_getfd			sys_pidfd_getfd
+439	n64	faccessat2			sys_faccessat2
+440	n64	process_madvise			sys_process_madvise
+441	n64	epoll_pwait2			sys_epoll_pwait2
diff --git a/tools/perf/arch/mips/include/dwarf-regs-table.h b/tools/perf/arch/mips/include/dwarf-regs-table.h
new file mode 100644
index 000000000000..5badbcd3c5ec
--- /dev/null
+++ b/tools/perf/arch/mips/include/dwarf-regs-table.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * dwarf-regs-table.h : Mapping of DWARF debug register numbers into
+ * register names.
+ *
+ * Copyright (C) 2013 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifdef DEFINE_DWARF_REGSTR_TABLE
+#undef REG_DWARFNUM_NAME
+#define REG_DWARFNUM_NAME(reg, idx)	[idx] = "$" #reg
+static const char * const mips_regstr_tbl[] = {
+	"$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9",
+	"$10", "$11", "$12", "$13", "$14", "$15", "$16", "$17", "$18", "$19",
+	"$20", "$21", "$22", "$23", "$24", "$25", "$26", "$27", "$28", "%29",
+	"$30", "$31",
+	REG_DWARFNUM_NAME(hi, 64),
+	REG_DWARFNUM_NAME(lo, 65),
+};
+#endif
diff --git a/tools/perf/arch/mips/include/perf_regs.h b/tools/perf/arch/mips/include/perf_regs.h
new file mode 100644
index 000000000000..ee73b36a14d1
--- /dev/null
+++ b/tools/perf/arch/mips/include/perf_regs.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MAX PERF_REG_MIPS_MAX
+#define PERF_REG_IP PERF_REG_MIPS_PC
+#define PERF_REG_SP PERF_REG_MIPS_R29
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_MIPS_MAX) - 1)
+
+static inline const char *__perf_reg_name(int id)
+{
+	switch (id) {
+	case PERF_REG_MIPS_PC:
+		return "PC";
+	case PERF_REG_MIPS_R1:
+		return "$1";
+	case PERF_REG_MIPS_R2:
+		return "$2";
+	case PERF_REG_MIPS_R3:
+		return "$3";
+	case PERF_REG_MIPS_R4:
+		return "$4";
+	case PERF_REG_MIPS_R5:
+		return "$5";
+	case PERF_REG_MIPS_R6:
+		return "$6";
+	case PERF_REG_MIPS_R7:
+		return "$7";
+	case PERF_REG_MIPS_R8:
+		return "$8";
+	case PERF_REG_MIPS_R9:
+		return "$9";
+	case PERF_REG_MIPS_R10:
+		return "$10";
+	case PERF_REG_MIPS_R11:
+		return "$11";
+	case PERF_REG_MIPS_R12:
+		return "$12";
+	case PERF_REG_MIPS_R13:
+		return "$13";
+	case PERF_REG_MIPS_R14:
+		return "$14";
+	case PERF_REG_MIPS_R15:
+		return "$15";
+	case PERF_REG_MIPS_R16:
+		return "$16";
+	case PERF_REG_MIPS_R17:
+		return "$17";
+	case PERF_REG_MIPS_R18:
+		return "$18";
+	case PERF_REG_MIPS_R19:
+		return "$19";
+	case PERF_REG_MIPS_R20:
+		return "$20";
+	case PERF_REG_MIPS_R21:
+		return "$21";
+	case PERF_REG_MIPS_R22:
+		return "$22";
+	case PERF_REG_MIPS_R23:
+		return "$23";
+	case PERF_REG_MIPS_R24:
+		return "$24";
+	case PERF_REG_MIPS_R25:
+		return "$25";
+	case PERF_REG_MIPS_R28:
+		return "$28";
+	case PERF_REG_MIPS_R29:
+		return "$29";
+	case PERF_REG_MIPS_R30:
+		return "$30";
+	case PERF_REG_MIPS_R31:
+		return "$31";
+	default:
+		break;
+	}
+	return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/mips/util/Build b/tools/perf/arch/mips/util/Build
new file mode 100644
index 000000000000..51c8900a9a10
--- /dev/null
+++ b/tools/perf/arch/mips/util/Build
@@ -0,0 +1,3 @@
+perf-y += perf_regs.o
+perf-$(CONFIG_DWARF) += dwarf-regs.o
+perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/mips/util/dwarf-regs.c b/tools/perf/arch/mips/util/dwarf-regs.c
new file mode 100644
index 000000000000..25c13a91c2a7
--- /dev/null
+++ b/tools/perf/arch/mips/util/dwarf-regs.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2013 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <stdio.h>
+#include <dwarf-regs.h>
+
+static const char *mips_gpr_names[32] = {
+	"$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", "$9",
+	"$10", "$11", "$12", "$13", "$14", "$15", "$16", "$17", "$18", "$19",
+	"$20", "$21", "$22", "$23", "$24", "$25", "$26", "$27", "$28", "$29",
+	"$30", "$31"
+};
+
+const char *get_arch_regstr(unsigned int n)
+{
+	if (n < 32)
+		return mips_gpr_names[n];
+	if (n == 64)
+		return "hi";
+	if (n == 65)
+		return "lo";
+	return NULL;
+}
diff --git a/tools/perf/arch/mips/util/perf_regs.c b/tools/perf/arch/mips/util/perf_regs.c
new file mode 100644
index 000000000000..2864e2e3776d
--- /dev/null
+++ b/tools/perf/arch/mips/util/perf_regs.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../../util/perf_regs.h"
+
+const struct sample_reg sample_reg_masks[] = {
+	SMPL_REG_END
+};
diff --git a/tools/perf/arch/mips/util/unwind-libunwind.c b/tools/perf/arch/mips/util/unwind-libunwind.c
new file mode 100644
index 000000000000..0d8c99c29da6
--- /dev/null
+++ b/tools/perf/arch/mips/util/unwind-libunwind.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+#include "util/debug.h"
+
+int libunwind__arch_reg_id(int regnum)
+{
+	switch (regnum) {
+	case UNW_MIPS_R1 ... UNW_MIPS_R25:
+		return regnum - UNW_MIPS_R1 + PERF_REG_MIPS_R1;
+	case UNW_MIPS_R28 ... UNW_MIPS_R31:
+		return regnum - UNW_MIPS_R28 + PERF_REG_MIPS_R28;
+	case UNW_MIPS_PC:
+		return PERF_REG_MIPS_PC;
+	default:
+		pr_err("unwind: invalid reg id %d\n", regnum);
+		return -EINVAL;
+	}
+}
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index b7945e5a543b..8a79c4126e5b 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -4,6 +4,8 @@ perf-y += kvm-stat.o
 perf-y += perf_regs.o
 perf-y += mem-events.o
 perf-y += sym-handling.o
+perf-y += evsel.o
+perf-y += event.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_DWARF) += skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/event.c b/tools/perf/arch/powerpc/util/event.c
new file mode 100644
index 000000000000..3bf441257466
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/event.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+
+#include "../../../util/event.h"
+#include "../../../util/synthetic-events.h"
+#include "../../../util/machine.h"
+#include "../../../util/tool.h"
+#include "../../../util/map.h"
+#include "../../../util/debug.h"
+
+void arch_perf_parse_sample_weight(struct perf_sample *data,
+				   const __u64 *array, u64 type)
+{
+	union perf_sample_weight weight;
+
+	weight.full = *array;
+	if (type & PERF_SAMPLE_WEIGHT)
+		data->weight = weight.full;
+	else {
+		data->weight = weight.var1_dw;
+		data->ins_lat = weight.var2_w;
+		data->p_stage_cyc = weight.var3_w;
+	}
+}
+
+void arch_perf_synthesize_sample_weight(const struct perf_sample *data,
+					__u64 *array, u64 type)
+{
+	*array = data->weight;
+
+	if (type & PERF_SAMPLE_WEIGHT_STRUCT) {
+		*array &= 0xffffffff;
+		*array |= ((u64)data->ins_lat << 32);
+	}
+}
+
+const char *arch_perf_header_entry(const char *se_header)
+{
+	if (!strcmp(se_header, "Local INSTR Latency"))
+		return "Finish Cyc";
+	else if (!strcmp(se_header, "Pipeline Stage Cycle"))
+		return "Dispatch Cyc";
+	return se_header;
+}
+
+int arch_support_sort_key(const char *sort_key)
+{
+	if (!strcmp(sort_key, "p_stage_cyc"))
+		return 1;
+	return 0;
+}
diff --git a/tools/perf/arch/powerpc/util/evsel.c b/tools/perf/arch/powerpc/util/evsel.c
new file mode 100644
index 000000000000..2f733cdc8dbb
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/evsel.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include "util/evsel.h"
+
+void arch_evsel__set_sample_weight(struct evsel *evsel)
+{
+	evsel__set_sample_bit(evsel, WEIGHT_STRUCT);
+}
diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c
index eed9e5a42935..16510686c138 100644
--- a/tools/perf/arch/powerpc/util/kvm-stat.c
+++ b/tools/perf/arch/powerpc/util/kvm-stat.c
@@ -176,7 +176,7 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused)
 }
 
 /*
- * Incase of powerpc architecture, pmu registers are programmable
+ * In case of powerpc architecture, pmu registers are programmable
  * by guest kernel. So monitoring guest via host may not provide
  * valid samples with default 'cycles' event. It is better to use
  * 'trace_imc/trace_cycles' event for guest profiling, since it
diff --git a/tools/perf/arch/powerpc/util/utils_header.h b/tools/perf/arch/powerpc/util/utils_header.h
index 5788eb1f1fe3..2baeb1c1ae85 100644
--- a/tools/perf/arch/powerpc/util/utils_header.h
+++ b/tools/perf/arch/powerpc/util/utils_header.h
@@ -10,6 +10,6 @@
 
 #define SPRN_PVR        0x11F   /* Processor Version Register */
 #define PVR_VER(pvr)    (((pvr) >>  16) & 0xFFFF) /* Version field */
-#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revison field */
+#define PVR_REV(pvr)    (((pvr) >>   0) & 0xFFFF) /* Revision field */
 
 #endif /* __PERF_UTIL_HEADER_H */
diff --git a/tools/perf/arch/x86/tests/bp-modify.c b/tools/perf/arch/x86/tests/bp-modify.c
index adcacf1b6609..dffcf9b52153 100644
--- a/tools/perf/arch/x86/tests/bp-modify.c
+++ b/tools/perf/arch/x86/tests/bp-modify.c
@@ -73,7 +73,7 @@ static int bp_modify1(void)
 	/*
 	 * The parent does following steps:
 	 *  - creates a new breakpoint (id 0) for bp_2 function
-	 *  - changes that breakponit to bp_1 function
+	 *  - changes that breakpoint to bp_1 function
 	 *  - waits for the breakpoint to hit and checks
 	 *    it has proper rip of bp_1 function
 	 *  - detaches the child
diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build
index 0c72d418932e..dbeb04cb336e 100644
--- a/tools/perf/arch/x86/util/Build
+++ b/tools/perf/arch/x86/util/Build
@@ -9,6 +9,7 @@ perf-y += event.o
 perf-y += evlist.o
 perf-y += mem-events.o
 perf-y += evsel.o
+perf-y += iostat.o
 
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o
diff --git a/tools/perf/arch/x86/util/iostat.c b/tools/perf/arch/x86/util/iostat.c
new file mode 100644
index 000000000000..d63acb782b63
--- /dev/null
+++ b/tools/perf/arch/x86/util/iostat.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov <alexander.antonov@linux.intel.com>
+ */
+
+#include <api/fs/fs.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <regex.h>
+#include "util/cpumap.h"
+#include "util/debug.h"
+#include "util/iostat.h"
+#include "util/counts.h"
+#include "path.h"
+
+#ifndef MAX_PATH
+#define MAX_PATH 1024
+#endif
+
+#define UNCORE_IIO_PMU_PATH	"devices/uncore_iio_%d"
+#define SYSFS_UNCORE_PMU_PATH	"%s/"UNCORE_IIO_PMU_PATH
+#define PLATFORM_MAPPING_PATH	UNCORE_IIO_PMU_PATH"/die%d"
+
+/*
+ * Each metric requiries one IIO event which increments at every 4B transfer
+ * in corresponding direction. The formulas to compute metrics are generic:
+ *     #EventCount * 4B / (1024 * 1024)
+ */
+static const char * const iostat_metrics[] = {
+	"Inbound Read(MB)",
+	"Inbound Write(MB)",
+	"Outbound Read(MB)",
+	"Outbound Write(MB)",
+};
+
+static inline int iostat_metrics_count(void)
+{
+	return sizeof(iostat_metrics) / sizeof(char *);
+}
+
+static const char *iostat_metric_by_idx(int idx)
+{
+	return *(iostat_metrics + idx % iostat_metrics_count());
+}
+
+struct iio_root_port {
+	u32 domain;
+	u8 bus;
+	u8 die;
+	u8 pmu_idx;
+	int idx;
+};
+
+struct iio_root_ports_list {
+	struct iio_root_port **rps;
+	int nr_entries;
+};
+
+static struct iio_root_ports_list *root_ports;
+
+static void iio_root_port_show(FILE *output,
+			       const struct iio_root_port * const rp)
+{
+	if (output && rp)
+		fprintf(output, "S%d-uncore_iio_%d<%04x:%02x>\n",
+			rp->die, rp->pmu_idx, rp->domain, rp->bus);
+}
+
+static struct iio_root_port *iio_root_port_new(u32 domain, u8 bus,
+					       u8 die, u8 pmu_idx)
+{
+	struct iio_root_port *p = calloc(1, sizeof(*p));
+
+	if (p) {
+		p->domain = domain;
+		p->bus = bus;
+		p->die = die;
+		p->pmu_idx = pmu_idx;
+	}
+	return p;
+}
+
+static void iio_root_ports_list_free(struct iio_root_ports_list *list)
+{
+	int idx;
+
+	if (list) {
+		for (idx = 0; idx < list->nr_entries; idx++)
+			free(list->rps[idx]);
+		free(list->rps);
+		free(list);
+	}
+}
+
+static struct iio_root_port *iio_root_port_find_by_notation(
+	const struct iio_root_ports_list * const list, u32 domain, u8 bus)
+{
+	int idx;
+	struct iio_root_port *rp;
+
+	if (list) {
+		for (idx = 0; idx < list->nr_entries; idx++) {
+			rp = list->rps[idx];
+			if (rp && rp->domain == domain && rp->bus == bus)
+				return rp;
+		}
+	}
+	return NULL;
+}
+
+static int iio_root_ports_list_insert(struct iio_root_ports_list *list,
+				      struct iio_root_port * const rp)
+{
+	struct iio_root_port **tmp_buf;
+
+	if (list && rp) {
+		rp->idx = list->nr_entries++;
+		tmp_buf = realloc(list->rps,
+				  list->nr_entries * sizeof(*list->rps));
+		if (!tmp_buf) {
+			pr_err("Failed to realloc memory\n");
+			return -ENOMEM;
+		}
+		tmp_buf[rp->idx] = rp;
+		list->rps = tmp_buf;
+	}
+	return 0;
+}
+
+static int iio_mapping(u8 pmu_idx, struct iio_root_ports_list * const list)
+{
+	char *buf;
+	char path[MAX_PATH];
+	u32 domain;
+	u8 bus;
+	struct iio_root_port *rp;
+	size_t size;
+	int ret;
+
+	for (int die = 0; die < cpu__max_node(); die++) {
+		scnprintf(path, MAX_PATH, PLATFORM_MAPPING_PATH, pmu_idx, die);
+		if (sysfs__read_str(path, &buf, &size) < 0) {
+			if (pmu_idx)
+				goto out;
+			pr_err("Mode iostat is not supported\n");
+			return -1;
+		}
+		ret = sscanf(buf, "%04x:%02hhx", &domain, &bus);
+		free(buf);
+		if (ret != 2) {
+			pr_err("Invalid mapping data: iio_%d; die%d\n",
+			       pmu_idx, die);
+			return -1;
+		}
+		rp = iio_root_port_new(domain, bus, die, pmu_idx);
+		if (!rp || iio_root_ports_list_insert(list, rp)) {
+			free(rp);
+			return -ENOMEM;
+		}
+	}
+out:
+	return 0;
+}
+
+static u8 iio_pmu_count(void)
+{
+	u8 pmu_idx = 0;
+	char path[MAX_PATH];
+	const char *sysfs = sysfs__mountpoint();
+
+	if (sysfs) {
+		for (;; pmu_idx++) {
+			snprintf(path, sizeof(path), SYSFS_UNCORE_PMU_PATH,
+				 sysfs, pmu_idx);
+			if (access(path, F_OK) != 0)
+				break;
+		}
+	}
+	return pmu_idx;
+}
+
+static int iio_root_ports_scan(struct iio_root_ports_list **list)
+{
+	int ret = -ENOMEM;
+	struct iio_root_ports_list *tmp_list;
+	u8 pmu_count = iio_pmu_count();
+
+	if (!pmu_count) {
+		pr_err("Unsupported uncore pmu configuration\n");
+		return -1;
+	}
+
+	tmp_list = calloc(1, sizeof(*tmp_list));
+	if (!tmp_list)
+		goto err;
+
+	for (u8 pmu_idx = 0; pmu_idx < pmu_count; pmu_idx++) {
+		ret = iio_mapping(pmu_idx, tmp_list);
+		if (ret)
+			break;
+	}
+err:
+	if (!ret)
+		*list = tmp_list;
+	else
+		iio_root_ports_list_free(tmp_list);
+
+	return ret;
+}
+
+static int iio_root_port_parse_str(u32 *domain, u8 *bus, char *str)
+{
+	int ret;
+	regex_t regex;
+	/*
+	 * Expected format domain:bus:
+	 * Valid domain range [0:ffff]
+	 * Valid bus range [0:ff]
+	 * Example: 0000:af, 0:3d, 01:7
+	 */
+	regcomp(&regex, "^([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})", REG_EXTENDED);
+	ret = regexec(&regex, str, 0, NULL, 0);
+	if (ret || sscanf(str, "%08x:%02hhx", domain, bus) != 2)
+		pr_warning("Unrecognized root port format: %s\n"
+			   "Please use the following format:\n"
+			   "\t [domain]:[bus]\n"
+			   "\t for example: 0000:3d\n", str);
+
+	regfree(&regex);
+	return ret;
+}
+
+static int iio_root_ports_list_filter(struct iio_root_ports_list **list,
+				      const char *filter)
+{
+	char *tok, *tmp, *filter_copy = NULL;
+	struct iio_root_port *rp;
+	u32 domain;
+	u8 bus;
+	int ret = -ENOMEM;
+	struct iio_root_ports_list *tmp_list = calloc(1, sizeof(*tmp_list));
+
+	if (!tmp_list)
+		goto err;
+
+	filter_copy = strdup(filter);
+	if (!filter_copy)
+		goto err;
+
+	for (tok = strtok_r(filter_copy, ",", &tmp); tok;
+	     tok = strtok_r(NULL, ",", &tmp)) {
+		if (!iio_root_port_parse_str(&domain, &bus, tok)) {
+			rp = iio_root_port_find_by_notation(*list, domain, bus);
+			if (rp) {
+				(*list)->rps[rp->idx] = NULL;
+				ret = iio_root_ports_list_insert(tmp_list, rp);
+				if (ret) {
+					free(rp);
+					goto err;
+				}
+			} else if (!iio_root_port_find_by_notation(tmp_list,
+								   domain, bus))
+				pr_warning("Root port %04x:%02x were not found\n",
+					   domain, bus);
+		}
+	}
+
+	if (tmp_list->nr_entries == 0) {
+		pr_err("Requested root ports were not found\n");
+		ret = -EINVAL;
+	}
+err:
+	iio_root_ports_list_free(*list);
+	if (ret)
+		iio_root_ports_list_free(tmp_list);
+	else
+		*list = tmp_list;
+
+	free(filter_copy);
+	return ret;
+}
+
+static int iostat_event_group(struct evlist *evl,
+			      struct iio_root_ports_list *list)
+{
+	int ret;
+	int idx;
+	const char *iostat_cmd_template =
+	"{uncore_iio_%x/event=0x83,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\
+	  uncore_iio_%x/event=0x83,umask=0x01,ch_mask=0xF,fc_mask=0x07/,\
+	  uncore_iio_%x/event=0xc0,umask=0x04,ch_mask=0xF,fc_mask=0x07/,\
+	  uncore_iio_%x/event=0xc0,umask=0x01,ch_mask=0xF,fc_mask=0x07/}";
+	const int len_template = strlen(iostat_cmd_template) + 1;
+	struct evsel *evsel = NULL;
+	int metrics_count = iostat_metrics_count();
+	char *iostat_cmd = calloc(len_template, 1);
+
+	if (!iostat_cmd)
+		return -ENOMEM;
+
+	for (idx = 0; idx < list->nr_entries; idx++) {
+		sprintf(iostat_cmd, iostat_cmd_template,
+			list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx,
+			list->rps[idx]->pmu_idx, list->rps[idx]->pmu_idx);
+		ret = parse_events(evl, iostat_cmd, NULL);
+		if (ret)
+			goto err;
+	}
+
+	evlist__for_each_entry(evl, evsel) {
+		evsel->priv = list->rps[evsel->idx / metrics_count];
+	}
+	list->nr_entries = 0;
+err:
+	iio_root_ports_list_free(list);
+	free(iostat_cmd);
+	return ret;
+}
+
+int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config)
+{
+	if (evlist->core.nr_entries > 0) {
+		pr_warning("The -e and -M options are not supported."
+			   "All chosen events/metrics will be dropped\n");
+		evlist__delete(evlist);
+		evlist = evlist__new();
+		if (!evlist)
+			return -ENOMEM;
+	}
+
+	config->metric_only = true;
+	config->aggr_mode = AGGR_GLOBAL;
+
+	return iostat_event_group(evlist, root_ports);
+}
+
+int iostat_parse(const struct option *opt, const char *str,
+		 int unset __maybe_unused)
+{
+	int ret;
+	struct perf_stat_config *config = (struct perf_stat_config *)opt->data;
+
+	ret = iio_root_ports_scan(&root_ports);
+	if (!ret) {
+		config->iostat_run = true;
+		if (!str)
+			iostat_mode = IOSTAT_RUN;
+		else if (!strcmp(str, "list"))
+			iostat_mode = IOSTAT_LIST;
+		else {
+			iostat_mode = IOSTAT_RUN;
+			ret = iio_root_ports_list_filter(&root_ports, str);
+		}
+	}
+	return ret;
+}
+
+void iostat_list(struct evlist *evlist, struct perf_stat_config *config)
+{
+	struct evsel *evsel;
+	struct iio_root_port *rp = NULL;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (rp != evsel->priv) {
+			rp = evsel->priv;
+			iio_root_port_show(config->output, rp);
+		}
+	}
+}
+
+void iostat_release(struct evlist *evlist)
+{
+	struct evsel *evsel;
+	struct iio_root_port *rp = NULL;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (rp != evsel->priv) {
+			rp = evsel->priv;
+			free(evsel->priv);
+		}
+	}
+}
+
+void iostat_prefix(struct evlist *evlist,
+		   struct perf_stat_config *config,
+		   char *prefix, struct timespec *ts)
+{
+	struct iio_root_port *rp = evlist->selected->priv;
+
+	if (rp) {
+		if (ts)
+			sprintf(prefix, "%6lu.%09lu%s%04x:%02x%s",
+				ts->tv_sec, ts->tv_nsec,
+				config->csv_sep, rp->domain, rp->bus,
+				config->csv_sep);
+		else
+			sprintf(prefix, "%04x:%02x%s", rp->domain, rp->bus,
+				config->csv_sep);
+	}
+}
+
+void iostat_print_header_prefix(struct perf_stat_config *config)
+{
+	if (config->csv_output)
+		fputs("port,", config->output);
+	else if (config->interval)
+		fprintf(config->output, "#          time    port         ");
+	else
+		fprintf(config->output, "   port         ");
+}
+
+void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+			 struct perf_stat_output_ctx *out)
+{
+	double iostat_value = 0;
+	u64 prev_count_val = 0;
+	const char *iostat_metric = iostat_metric_by_idx(evsel->idx);
+	u8 die = ((struct iio_root_port *)evsel->priv)->die;
+	struct perf_counts_values *count = perf_counts(evsel->counts, die, 0);
+
+	if (count->run && count->ena) {
+		if (evsel->prev_raw_counts && !out->force_header) {
+			struct perf_counts_values *prev_count =
+				perf_counts(evsel->prev_raw_counts, die, 0);
+
+			prev_count_val = prev_count->val;
+			prev_count->val = count->val;
+		}
+		iostat_value = (count->val - prev_count_val) /
+			       ((double) count->run / count->ena);
+	}
+	out->print_metric(config, out->ctx, NULL, "%8.0f", iostat_metric,
+			  iostat_value / (256 * 1024));
+}
+
+void iostat_print_counters(struct evlist *evlist,
+			   struct perf_stat_config *config, struct timespec *ts,
+			   char *prefix, iostat_print_counter_t print_cnt_cb)
+{
+	void *perf_device = NULL;
+	struct evsel *counter = evlist__first(evlist);
+
+	evlist__set_selected(evlist, counter);
+	iostat_prefix(evlist, config, prefix, ts);
+	fprintf(config->output, "%s", prefix);
+	evlist__for_each_entry(evlist, counter) {
+		perf_device = evlist->selected->priv;
+		if (perf_device && perf_device != counter->priv) {
+			evlist__set_selected(evlist, counter);
+			iostat_prefix(evlist, config, prefix, ts);
+			fprintf(config->output, "\n%s", prefix);
+		}
+		print_cnt_cb(config, counter, prefix);
+	}
+	fputc('\n', config->output);
+}
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
index fca81b39b09f..207c56805c55 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -165,7 +165,7 @@ static int sdt_init_op_regex(void)
 /*
  * Max x86 register name length is 5(ex: %r15d). So, 6th char
  * should always contain NULL. This helps to find register name
- * length using strlen, insted of maintaing one more variable.
+ * length using strlen, instead of maintaining one more variable.
  */
 #define SDT_REG_NAME_SIZE  6
 
@@ -207,7 +207,7 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
 	 * and displacement 0 (Both sign and displacement 0 are
 	 * optional so it may be empty). Use one more character
 	 * to hold last NULL so that strlen can be used to find
-	 * prefix length, instead of maintaing one more variable.
+	 * prefix length, instead of maintaining one more variable.
 	 */
 	char prefix[3] = {0};
 
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index 0a0ff1247c83..79d13dbc0a47 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -17,7 +17,7 @@
  * While the second model, enabled via --multiq option, uses multiple
  * queueing (which refers to one epoll instance per worker). For example,
  * short lived tcp connections in a high throughput httpd server will
- * ditribute the accept()'ing  connections across CPUs. In this case each
+ * distribute the accept()'ing  connections across CPUs. In this case each
  * worker does a limited  amount of processing.
  *
  *             [queue A]  ---> [worker]
@@ -198,7 +198,7 @@ static void *workerfn(void *arg)
 
 	do {
 		/*
-		 * Block undefinitely waiting for the IN event.
+		 * Block indefinitely waiting for the IN event.
 		 * In order to stress the epoll_wait(2) syscall,
 		 * call it event per event, instead of a larger
 		 * batch (max)limit.
diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c
index 280227e3ffd7..55d373b75791 100644
--- a/tools/perf/bench/inject-buildid.c
+++ b/tools/perf/bench/inject-buildid.c
@@ -372,7 +372,7 @@ static int inject_build_id(struct bench_data *data, u64 *max_rss)
 			len += synthesize_flush(data);
 	}
 
-	/* tihs makes the child to finish */
+	/* this makes the child to finish */
 	close(data->input_pipe[1]);
 
 	wait4(data->pid, &status, 0, &rusage);
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 20b87e29c96f..f2640179ada9 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -42,7 +42,7 @@
 #endif
 
 /*
- * Regular printout to the terminal, supressed if -q is specified:
+ * Regular printout to the terminal, suppressed if -q is specified:
  */
 #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index a23ba6bb99b6..49627a7bed7c 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -239,7 +239,7 @@ static int evsel__add_sample(struct evsel *evsel, struct perf_sample *sample,
 	}
 
 	/*
-	 * XXX filtered samples can still have branch entires pointing into our
+	 * XXX filtered samples can still have branch entries pointing into our
 	 * symbol and are missed.
 	 */
 	process_branch_stack(sample->branch_stack, al, sample);
@@ -374,13 +374,6 @@ find_next:
 		} else {
 			hist_entry__tty_annotate(he, evsel, ann);
 			nd = rb_next(nd);
-			/*
-			 * Since we have a hist_entry per IP for the same
-			 * symbol, free he->ms.sym->src to signal we already
-			 * processed this symbol.
-			 */
-			zfree(&notes->src->cycles_hist);
-			zfree(&notes->src);
 		}
 	}
 }
@@ -411,8 +404,8 @@ static int __cmd_annotate(struct perf_annotate *ann)
 		goto out;
 
 	if (dump_trace) {
-		perf_session__fprintf_nr_events(session, stdout);
-		evlist__fprintf_nr_events(session->evlist, stdout);
+		perf_session__fprintf_nr_events(session, stdout, false);
+		evlist__fprintf_nr_events(session->evlist, stdout, false);
 		goto out;
 	}
 
@@ -425,7 +418,7 @@ static int __cmd_annotate(struct perf_annotate *ann)
 	total_nr_samples = 0;
 	evlist__for_each_entry(session->evlist, pos) {
 		struct hists *hists = evsel__hists(pos);
-		u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+		u32 nr_samples = hists->stats.nr_samples;
 
 		if (nr_samples > 0) {
 			total_nr_samples += nr_samples;
@@ -538,6 +531,10 @@ int cmd_annotate(int argc, const char **argv)
 		    "Strip first N entries of source file path name in programs (with --prefix)"),
 	OPT_STRING(0, "objdump", &annotate.opts.objdump_path, "path",
 		   "objdump binary to use for disassembly and annotations"),
+	OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
+		    "Enable symbol demangling"),
+	OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel,
+		    "Enable kernel symbol demangling"),
 	OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
 		    "Show event group information together"),
 	OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
@@ -619,14 +616,22 @@ int cmd_annotate(int argc, const char **argv)
 
 	setup_browser(true);
 
-	if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack) {
+	/*
+	 * Events of different processes may correspond to the same
+	 * symbol, we do not care about the processes in annotate,
+	 * set sort order to avoid repeated output.
+	 */
+	sort_order = "dso,symbol";
+
+	/*
+	 * Set SORT_MODE__BRANCH so that annotate display IPC/Cycle
+	 * if branch info is in perf data in TUI mode.
+	 */
+	if ((use_browser == 1 || annotate.use_stdio2) && annotate.has_br_stack)
 		sort__mode = SORT_MODE__BRANCH;
-		if (setup_sorting(annotate.session->evlist) < 0)
-			usage_with_options(annotate_usage, options);
-	} else {
-		if (setup_sorting(NULL) < 0)
-			usage_with_options(annotate_usage, options);
-	}
+
+	if (setup_sorting(NULL) < 0)
+		usage_with_options(annotate_usage, options);
 
 	ret = __cmd_annotate(&annotate);
 
diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c
index 7c4a9d424a64..61929f63a047 100644
--- a/tools/perf/builtin-daemon.c
+++ b/tools/perf/builtin-daemon.c
@@ -6,7 +6,6 @@
 #include <linux/zalloc.h>
 #include <linux/string.h>
 #include <linux/limits.h>
-#include <linux/string.h>
 #include <string.h>
 #include <sys/file.h>
 #include <signal.h>
@@ -24,8 +23,6 @@
 #include <sys/signalfd.h>
 #include <sys/wait.h>
 #include <poll.h>
-#include <sys/stat.h>
-#include <time.h>
 #include "builtin.h"
 #include "perf.h"
 #include "debug.h"
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
index 8d23b8d6ee8e..15ca23675ef0 100644
--- a/tools/perf/builtin-data.c
+++ b/tools/perf/builtin-data.c
@@ -7,7 +7,6 @@
 #include "debug.h"
 #include <subcmd/parse-options.h>
 #include "data-convert.h"
-#include "data-convert-bt.h"
 
 typedef int (*data_cmd_fn_t)(int argc, const char **argv);
 
@@ -55,7 +54,8 @@ static const char * const data_convert_usage[] = {
 
 static int cmd_data_convert(int argc, const char **argv)
 {
-	const char *to_ctf     = NULL;
+	const char *to_json = NULL;
+	const char *to_ctf = NULL;
 	struct perf_data_convert_opts opts = {
 		.force = false,
 		.all = false,
@@ -63,6 +63,7 @@ static int cmd_data_convert(int argc, const char **argv)
 	const struct option options[] = {
 		OPT_INCR('v', "verbose", &verbose, "be more verbose"),
 		OPT_STRING('i', "input", &input_name, "file", "input file name"),
+		OPT_STRING(0, "to-json", &to_json, NULL, "Convert to JSON format"),
 #ifdef HAVE_LIBBABELTRACE_SUPPORT
 		OPT_STRING(0, "to-ctf", &to_ctf, NULL, "Convert to CTF format"),
 		OPT_BOOLEAN(0, "tod", &opts.tod, "Convert time to wall clock time"),
@@ -72,11 +73,6 @@ static int cmd_data_convert(int argc, const char **argv)
 		OPT_END()
 	};
 
-#ifndef HAVE_LIBBABELTRACE_SUPPORT
-	pr_err("No conversion support compiled in. perf should be compiled with environment variables LIBBABELTRACE=1 and LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n");
-	return -1;
-#endif
-
 	argc = parse_options(argc, argv, options,
 			     data_convert_usage, 0);
 	if (argc) {
@@ -84,11 +80,25 @@ static int cmd_data_convert(int argc, const char **argv)
 		return -1;
 	}
 
+	if (to_json && to_ctf) {
+		pr_err("You cannot specify both --to-ctf and --to-json.\n");
+		return -1;
+	}
+	if (!to_json && !to_ctf) {
+		pr_err("You must specify one of --to-ctf or --to-json.\n");
+		return -1;
+	}
+
+	if (to_json)
+		return bt_convert__perf2json(input_name, to_json, &opts);
+
 	if (to_ctf) {
 #ifdef HAVE_LIBBABELTRACE_SUPPORT
 		return bt_convert__perf2ctf(input_name, to_ctf, &opts);
 #else
-		pr_err("The libbabeltrace support is not compiled in.\n");
+		pr_err("The libbabeltrace support is not compiled in. perf should be "
+		       "compiled with environment variables LIBBABELTRACE=1 and "
+		       "LIBBABELTRACE_DIR=/path/to/libbabeltrace/\n");
 		return -1;
 #endif
 	}
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 878e04b1fab7..f52b3a799e76 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1796,7 +1796,7 @@ static int ui_init(void)
 	data__for_each_file(i, d) {
 
 		/*
-		 * Baseline or compute realted columns:
+		 * Baseline or compute related columns:
 		 *
 		 *   PERF_HPP_DIFF__BASELINE
 		 *   PERF_HPP_DIFF__DELTA
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index a2f1e53f37a7..01326e370009 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -49,7 +49,7 @@ struct lock_stat {
 
 	/*
 	 * FIXME: evsel__intval() returns u64,
-	 * so address of lockdep_map should be dealed as 64bit.
+	 * so address of lockdep_map should be treated as 64bit.
 	 * Is there more better solution?
 	 */
 	void			*addr;		/* address of lockdep_map, used as ID */
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 35465d1db6dd..3337b5f93336 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -47,6 +47,8 @@
 #include "util/util.h"
 #include "util/pfm.h"
 #include "util/clockid.h"
+#include "util/pmu-hybrid.h"
+#include "util/evlist-hybrid.h"
 #include "asm/bug.h"
 #include "perf.h"
 
@@ -1603,6 +1605,32 @@ static void hit_auxtrace_snapshot_trigger(struct record *rec)
 	}
 }
 
+static void record__uniquify_name(struct record *rec)
+{
+	struct evsel *pos;
+	struct evlist *evlist = rec->evlist;
+	char *new_name;
+	int ret;
+
+	if (!perf_pmu__has_hybrid())
+		return;
+
+	evlist__for_each_entry(evlist, pos) {
+		if (!evsel__is_hybrid(pos))
+			continue;
+
+		if (strchr(pos->name, '/'))
+			continue;
+
+		ret = asprintf(&new_name, "%s/%s/",
+			       pos->pmu_name, pos->name);
+		if (ret) {
+			free(pos->name);
+			pos->name = new_name;
+		}
+	}
+}
+
 static int __cmd_record(struct record *rec, int argc, const char **argv)
 {
 	int err;
@@ -1707,6 +1735,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
 		rec->opts.sample_id = true;
 
+	record__uniquify_name(rec);
+
 	if (record__open(rec) != 0) {
 		err = -1;
 		goto out_child;
@@ -1977,9 +2007,13 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		record__auxtrace_snapshot_exit(rec);
 
 	if (forks && workload_exec_errno) {
-		char msg[STRERR_BUFSIZE];
+		char msg[STRERR_BUFSIZE], strevsels[2048];
 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
-		pr_err("Workload failed: %s\n", emsg);
+
+		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
+
+		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
+			strevsels, argv[0], emsg);
 		err = -1;
 		goto out_child;
 	}
@@ -2786,10 +2820,19 @@ int cmd_record(int argc, const char **argv)
 	if (record.opts.overwrite)
 		record.opts.tail_synthesize = true;
 
-	if (rec->evlist->core.nr_entries == 0 &&
-	    __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
-		pr_err("Not enough memory for event selector list\n");
-		goto out;
+	if (rec->evlist->core.nr_entries == 0) {
+		if (perf_pmu__has_hybrid()) {
+			err = evlist__add_default_hybrid(rec->evlist,
+							 !record.opts.no_samples);
+		} else {
+			err = __evlist__add_default(rec->evlist,
+						    !record.opts.no_samples);
+		}
+
+		if (err < 0) {
+			pr_err("Not enough memory for event selector list\n");
+			goto out;
+		}
 	}
 
 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 2a845d6cac09..36f9ccfeb38a 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -84,6 +84,8 @@ struct report {
 	bool			nonany_branch_mode;
 	bool			group_set;
 	bool			stitch_lbr;
+	bool			disable_order;
+	bool			skip_empty;
 	int			max_stack;
 	struct perf_read_values	show_threads_values;
 	struct annotation_options annotation_opts;
@@ -134,6 +136,11 @@ static int report__config(const char *var, const char *value, void *cb)
 		return 0;
 	}
 
+	if (!strcmp(var, "report.skip-empty")) {
+		rep->skip_empty = perf_config_bool(var, value);
+		return 0;
+	}
+
 	return 0;
 }
 
@@ -435,7 +442,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report
 {
 	size_t ret;
 	char unit;
-	unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+	unsigned long nr_samples = hists->stats.nr_samples;
 	u64 nr_events = hists->stats.total_period;
 	struct evsel *evsel = hists_to_evsel(hists);
 	char buf[512];
@@ -463,7 +470,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report
 				nr_samples += pos_hists->stats.nr_non_filtered_samples;
 				nr_events += pos_hists->stats.total_non_filtered_period;
 			} else {
-				nr_samples += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE];
+				nr_samples += pos_hists->stats.nr_samples;
 				nr_events += pos_hists->stats.total_period;
 			}
 		}
@@ -529,6 +536,9 @@ static int evlist__tty_browse_hists(struct evlist *evlist, struct report *rep, c
 		if (symbol_conf.event_group && !evsel__is_group_leader(pos))
 			continue;
 
+		if (rep->skip_empty && !hists->stats.nr_samples)
+			continue;
+
 		hists__fprintf_nr_sample_events(hists, rep, evname, stdout);
 
 		if (rep->total_cycles_mode) {
@@ -707,9 +717,22 @@ static void report__output_resort(struct report *rep)
 	ui_progress__finish();
 }
 
+static int count_sample_event(struct perf_tool *tool __maybe_unused,
+			      union perf_event *event __maybe_unused,
+			      struct perf_sample *sample __maybe_unused,
+			      struct evsel *evsel,
+			      struct machine *machine __maybe_unused)
+{
+	struct hists *hists = evsel__hists(evsel);
+
+	hists__inc_nr_events(hists);
+	return 0;
+}
+
 static void stats_setup(struct report *rep)
 {
 	memset(&rep->tool, 0, sizeof(rep->tool));
+	rep->tool.sample = count_sample_event;
 	rep->tool.no_warn = true;
 }
 
@@ -717,7 +740,8 @@ static int stats_print(struct report *rep)
 {
 	struct perf_session *session = rep->session;
 
-	perf_session__fprintf_nr_events(session, stdout);
+	perf_session__fprintf_nr_events(session, stdout, rep->skip_empty);
+	evlist__fprintf_nr_events(session->evlist, stdout, rep->skip_empty);
 	return 0;
 }
 
@@ -929,8 +953,10 @@ static int __cmd_report(struct report *rep)
 			perf_session__fprintf_dsos(session, stdout);
 
 		if (dump_trace) {
-			perf_session__fprintf_nr_events(session, stdout);
-			evlist__fprintf_nr_events(session->evlist, stdout);
+			perf_session__fprintf_nr_events(session, stdout,
+							rep->skip_empty);
+			evlist__fprintf_nr_events(session->evlist, stdout,
+						  rep->skip_empty);
 			return 0;
 		}
 	}
@@ -1139,6 +1165,7 @@ int cmd_report(int argc, const char **argv)
 		.pretty_printing_style	 = "normal",
 		.socket_filter		 = -1,
 		.annotation_opts	 = annotation__default_options,
+		.skip_empty		 = true,
 	};
 	const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
@@ -1296,6 +1323,10 @@ int cmd_report(int argc, const char **argv)
 	OPTS_EVSWITCH(&report.evswitch),
 	OPT_BOOLEAN(0, "total-cycles", &report.total_cycles_mode,
 		    "Sort all blocks by 'Sampled Cycles%'"),
+	OPT_BOOLEAN(0, "disable-order", &report.disable_order,
+		    "Disable raw trace ordering"),
+	OPT_BOOLEAN(0, "skip-empty", &report.skip_empty,
+		    "Do not display empty (or dummy) events in the output"),
 	OPT_END()
 	};
 	struct perf_data data = {
@@ -1329,7 +1360,7 @@ int cmd_report(int argc, const char **argv)
 	if (report.mmaps_mode)
 		report.tasks_mode = true;
 
-	if (dump_trace)
+	if (dump_trace && report.disable_order)
 		report.tool.ordered_events = false;
 
 	if (quiet)
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 69c769b04a61..954ce2f594e9 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1712,7 +1712,7 @@ static int perf_sched__process_fork_event(struct perf_tool *tool,
 {
 	struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
 
-	/* run the fork event through the perf machineruy */
+	/* run the fork event through the perf machinery */
 	perf_event__process_fork(tool, event, sample, machine);
 
 	/* and then run additional processing needed for this command */
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 5915f19cee55..1280cbfad4db 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -314,8 +314,7 @@ static inline struct evsel_script *evsel_script(struct evsel *evsel)
 	return (struct evsel_script *)evsel->priv;
 }
 
-static struct evsel_script *perf_evsel_script__new(struct evsel *evsel,
-							struct perf_data *data)
+static struct evsel_script *evsel_script__new(struct evsel *evsel, struct perf_data *data)
 {
 	struct evsel_script *es = zalloc(sizeof(*es));
 
@@ -335,7 +334,7 @@ out_free:
 	return NULL;
 }
 
-static void perf_evsel_script__delete(struct evsel_script *es)
+static void evsel_script__delete(struct evsel_script *es)
 {
 	zfree(&es->filename);
 	fclose(es->fp);
@@ -343,7 +342,7 @@ static void perf_evsel_script__delete(struct evsel_script *es)
 	free(es);
 }
 
-static int perf_evsel_script__fprintf(struct evsel_script *es, FILE *fp)
+static int evsel_script__fprintf(struct evsel_script *es, FILE *fp)
 {
 	struct stat st;
 
@@ -2219,8 +2218,7 @@ static int process_attr(struct perf_tool *tool, union perf_event *event,
 
 	if (!evsel->priv) {
 		if (scr->per_event_dump) {
-			evsel->priv = perf_evsel_script__new(evsel,
-						scr->session->data);
+			evsel->priv = evsel_script__new(evsel, scr->session->data);
 		} else {
 			es = zalloc(sizeof(*es));
 			if (!es)
@@ -2475,7 +2473,7 @@ static void perf_script__fclose_per_event_dump(struct perf_script *script)
 	evlist__for_each_entry(evlist, evsel) {
 		if (!evsel->priv)
 			break;
-		perf_evsel_script__delete(evsel->priv);
+		evsel_script__delete(evsel->priv);
 		evsel->priv = NULL;
 	}
 }
@@ -2488,14 +2486,14 @@ static int perf_script__fopen_per_event_dump(struct perf_script *script)
 		/*
 		 * Already setup? I.e. we may be called twice in cases like
 		 * Intel PT, one for the intel_pt// and dummy events, then
-		 * for the evsels syntheized from the auxtrace info.
+		 * for the evsels synthesized from the auxtrace info.
 		 *
 		 * Ses perf_script__process_auxtrace_info.
 		 */
 		if (evsel->priv != NULL)
 			continue;
 
-		evsel->priv = perf_evsel_script__new(evsel, script->session->data);
+		evsel->priv = evsel_script__new(evsel, script->session->data);
 		if (evsel->priv == NULL)
 			goto out_err_fclose;
 	}
@@ -2530,8 +2528,8 @@ static void perf_script__exit_per_event_dump_stats(struct perf_script *script)
 	evlist__for_each_entry(script->session->evlist, evsel) {
 		struct evsel_script *es = evsel->priv;
 
-		perf_evsel_script__fprintf(es, stdout);
-		perf_evsel_script__delete(es);
+		evsel_script__fprintf(es, stdout);
+		evsel_script__delete(es);
 		evsel->priv = NULL;
 	}
 }
@@ -3085,7 +3083,7 @@ static int list_available_scripts(const struct option *opt __maybe_unused,
  *
  * Fixme: All existing "xxx-record" are all in good formats "-e event ",
  * which is covered well now. And new parsing code should be added to
- * cover the future complexing formats like event groups etc.
+ * cover the future complex formats like event groups etc.
  */
 static int check_ev_match(char *dir_name, char *scriptname,
 			struct perf_session *session)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2e2e4a8345ea..5a830ae09418 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -48,6 +48,7 @@
 #include "util/pmu.h"
 #include "util/event.h"
 #include "util/evlist.h"
+#include "util/evlist-hybrid.h"
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/color.h"
@@ -68,6 +69,8 @@
 #include "util/affinity.h"
 #include "util/pfm.h"
 #include "util/bpf_counter.h"
+#include "util/iostat.h"
+#include "util/pmu-hybrid.h"
 #include "asm/bug.h"
 
 #include <linux/time64.h>
@@ -160,6 +163,7 @@ static const char *smi_cost_attrs = {
 };
 
 static struct evlist	*evsel_list;
+static bool all_counters_use_bpf = true;
 
 static struct target target = {
 	.uid	= UINT_MAX,
@@ -212,7 +216,8 @@ static struct perf_stat_config stat_config = {
 	.walltime_nsecs_stats	= &walltime_nsecs_stats,
 	.big_num		= true,
 	.ctl_fd			= -1,
-	.ctl_fd_ack		= -1
+	.ctl_fd_ack		= -1,
+	.iostat_run		= false,
 };
 
 static bool cpus_map_matched(struct evsel *a, struct evsel *b)
@@ -239,6 +244,9 @@ static void evlist__check_cpu_maps(struct evlist *evlist)
 	struct evsel *evsel, *pos, *leader;
 	char buf[1024];
 
+	if (evlist__has_hybrid(evlist))
+		evlist__warn_hybrid_group(evlist);
+
 	evlist__for_each_entry(evlist, evsel) {
 		leader = evsel->leader;
 
@@ -399,6 +407,9 @@ static int read_affinity_counters(struct timespec *rs)
 	struct affinity affinity;
 	int i, ncpus, cpu;
 
+	if (all_counters_use_bpf)
+		return 0;
+
 	if (affinity__setup(&affinity) < 0)
 		return -1;
 
@@ -413,6 +424,8 @@ static int read_affinity_counters(struct timespec *rs)
 		evlist__for_each_entry(evsel_list, counter) {
 			if (evsel__cpu_iter_skip(counter, cpu))
 				continue;
+			if (evsel__is_bpf(counter))
+				continue;
 			if (!counter->err) {
 				counter->err = read_counter_cpu(counter, rs,
 								counter->cpu_iter - 1);
@@ -429,6 +442,9 @@ static int read_bpf_map_counters(void)
 	int err;
 
 	evlist__for_each_entry(evsel_list, counter) {
+		if (!evsel__is_bpf(counter))
+			continue;
+
 		err = bpf_counter__read(counter);
 		if (err)
 			return err;
@@ -439,14 +455,10 @@ static int read_bpf_map_counters(void)
 static void read_counters(struct timespec *rs)
 {
 	struct evsel *counter;
-	int err;
 
 	if (!stat_config.stop_read_counter) {
-		if (target__has_bpf(&target))
-			err = read_bpf_map_counters();
-		else
-			err = read_affinity_counters(rs);
-		if (err < 0)
+		if (read_bpf_map_counters() ||
+		    read_affinity_counters(rs))
 			return;
 	}
 
@@ -535,12 +547,13 @@ static int enable_counters(void)
 	struct evsel *evsel;
 	int err;
 
-	if (target__has_bpf(&target)) {
-		evlist__for_each_entry(evsel_list, evsel) {
-			err = bpf_counter__enable(evsel);
-			if (err)
-				return err;
-		}
+	evlist__for_each_entry(evsel_list, evsel) {
+		if (!evsel__is_bpf(evsel))
+			continue;
+
+		err = bpf_counter__enable(evsel);
+		if (err)
+			return err;
 	}
 
 	if (stat_config.initial_delay < 0) {
@@ -784,14 +797,20 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 	if (affinity__setup(&affinity) < 0)
 		return -1;
 
-	if (target__has_bpf(&target)) {
-		evlist__for_each_entry(evsel_list, counter) {
-			if (bpf_counter__load(counter, &target))
-				return -1;
-		}
+	evlist__for_each_entry(evsel_list, counter) {
+		if (bpf_counter__load(counter, &target))
+			return -1;
+		if (!evsel__is_bpf(counter))
+			all_counters_use_bpf = false;
 	}
 
 	evlist__for_each_cpu (evsel_list, i, cpu) {
+		/*
+		 * bperf calls evsel__open_per_cpu() in bperf__load(), so
+		 * no need to call it again here.
+		 */
+		if (target.use_bpf)
+			break;
 		affinity__set(&affinity, cpu);
 
 		evlist__for_each_entry(evsel_list, counter) {
@@ -799,6 +818,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
 				continue;
 			if (counter->reset_group || counter->errored)
 				continue;
+			if (evsel__is_bpf(counter))
+				continue;
 try_again:
 			if (create_perf_stat_counter(counter, &stat_config, &target,
 						     counter->cpu_iter - 1) < 0) {
@@ -925,15 +946,15 @@ try_again_reset:
 	/*
 	 * Enable counters and exec the command:
 	 */
-	t0 = rdclock();
-	clock_gettime(CLOCK_MONOTONIC, &ref_time);
-
 	if (forks) {
 		evlist__start_workload(evsel_list);
 		err = enable_counters();
 		if (err)
 			return -1;
 
+		t0 = rdclock();
+		clock_gettime(CLOCK_MONOTONIC, &ref_time);
+
 		if (interval || timeout || evlist__ctlfd_initialized(evsel_list))
 			status = dispatch_events(forks, timeout, interval, &times);
 		if (child_pid != -1) {
@@ -954,6 +975,10 @@ try_again_reset:
 		err = enable_counters();
 		if (err)
 			return -1;
+
+		t0 = rdclock();
+		clock_gettime(CLOCK_MONOTONIC, &ref_time);
+
 		status = dispatch_events(forks, timeout, interval, &times);
 	}
 
@@ -1083,6 +1108,11 @@ void perf_stat__set_big_num(int set)
 	stat_config.big_num = (set != 0);
 }
 
+void perf_stat__set_no_csv_summary(int set)
+{
+	stat_config.no_csv_summary = (set != 0);
+}
+
 static int stat__set_big_num(const struct option *opt __maybe_unused,
 			     const char *s __maybe_unused, int unset)
 {
@@ -1146,6 +1176,10 @@ static struct option stat_options[] = {
 #ifdef HAVE_BPF_SKEL
 	OPT_STRING('b', "bpf-prog", &target.bpf_str, "bpf-prog-id",
 		   "stat events on existing bpf program id"),
+	OPT_BOOLEAN(0, "bpf-counters", &target.use_bpf,
+		    "use bpf program to count events"),
+	OPT_STRING(0, "bpf-attr-map", &target.attr_map, "attr-map-path",
+		   "path to perf_event_attr map"),
 #endif
 	OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
 		    "system-wide collection from all CPUs"),
@@ -1235,6 +1269,8 @@ static struct option stat_options[] = {
 		    "threads of same physical core"),
 	OPT_BOOLEAN(0, "summary", &stat_config.summary,
 		       "print summary for interval mode"),
+	OPT_BOOLEAN(0, "no-csv-summary", &stat_config.no_csv_summary,
+		       "don't print 'summary' for CSV summary output"),
 	OPT_BOOLEAN(0, "quiet", &stat_config.quiet,
 			"don't print output (useful with record)"),
 #ifdef HAVE_LIBPFM
@@ -1247,6 +1283,9 @@ static struct option stat_options[] = {
 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
 		      parse_control_option),
+	OPT_CALLBACK_OPTARG(0, "iostat", &evsel_list, &stat_config, "default",
+			    "measure I/O performance metrics provided by arch/platform",
+			    iostat_parse),
 	OPT_END()
 };
 
@@ -1605,6 +1644,12 @@ static int add_default_attributes(void)
   { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES		},
 
 };
+	struct perf_event_attr default_sw_attrs[] = {
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK		},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS		},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS		},
+};
 
 /*
  * Detailed stats (-d), covering the L1 and last level data caches:
@@ -1705,7 +1750,7 @@ static int add_default_attributes(void)
 	bzero(&errinfo, sizeof(errinfo));
 	if (transaction_run) {
 		/* Handle -T as -M transaction. Once platform specific metrics
-		 * support has been added to the json files, all archictures
+		 * support has been added to the json files, all architectures
 		 * will use this approach. To determine transaction support
 		 * on an architecture test for such a metric name.
 		 */
@@ -1841,6 +1886,28 @@ setup_metrics:
 	}
 
 	if (!evsel_list->core.nr_entries) {
+		if (perf_pmu__has_hybrid()) {
+			const char *hybrid_str = "cycles,instructions,branches,branch-misses";
+
+			if (target__has_cpu(&target))
+				default_sw_attrs[0].config = PERF_COUNT_SW_CPU_CLOCK;
+
+			if (evlist__add_default_attrs(evsel_list,
+						      default_sw_attrs) < 0) {
+				return -1;
+			}
+
+			err = parse_events(evsel_list, hybrid_str, &errinfo);
+			if (err) {
+				fprintf(stderr,
+					"Cannot set up hybrid events %s: %d\n",
+					hybrid_str, err);
+				parse_events_print_error(&errinfo, hybrid_str);
+				return -1;
+			}
+			return err;
+		}
+
 		if (target__has_cpu(&target))
 			default_attrs0[0].config = PERF_COUNT_SW_CPU_CLOCK;
 
@@ -2320,6 +2387,17 @@ int cmd_stat(int argc, const char **argv)
 		goto out;
 	}
 
+	if (stat_config.iostat_run) {
+		status = iostat_prepare(evsel_list, &stat_config);
+		if (status)
+			goto out;
+		if (iostat_mode == IOSTAT_LIST) {
+			iostat_list(evsel_list, &stat_config);
+			goto out;
+		} else if (verbose)
+			iostat_list(evsel_list, &stat_config);
+	}
+
 	if (add_default_attributes())
 		goto out;
 
@@ -2357,6 +2435,9 @@ int cmd_stat(int argc, const char **argv)
 
 	evlist__check_cpu_maps(evsel_list);
 
+	if (perf_pmu__has_hybrid())
+		stat_config.no_merge = true;
+
 	/*
 	 * Initialize thread_map with comm names,
 	 * so we could print it out on output.
@@ -2459,7 +2540,7 @@ int cmd_stat(int argc, const char **argv)
 		/*
 		 * We synthesize the kernel mmap record just so that older tools
 		 * don't emit warnings about not being able to resolve symbols
-		 * due to /proc/sys/kernel/kptr_restrict settings and instear provide
+		 * due to /proc/sys/kernel/kptr_restrict settings and instead provide
 		 * a saner message about no samples being in the perf.data file.
 		 *
 		 * This also serves to suppress a warning about f_header.data.size == 0
@@ -2495,6 +2576,9 @@ int cmd_stat(int argc, const char **argv)
 	perf_stat__exit_aggr_mode();
 	evlist__free_stats(evsel_list);
 out:
+	if (stat_config.iostat_run)
+		iostat_release(evsel_list);
+
 	zfree(&stat_config.walltime_run);
 
 	if (smi_cost && smi_reset)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3673c04d16b6..69cb3635f5ef 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -328,13 +328,13 @@ static void perf_top__print_sym_table(struct perf_top *top)
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
 	if (!top->record_opts.overwrite &&
-	    (hists->stats.nr_lost_warned !=
-	    hists->stats.nr_events[PERF_RECORD_LOST])) {
-		hists->stats.nr_lost_warned =
-			      hists->stats.nr_events[PERF_RECORD_LOST];
+	    (top->evlist->stats.nr_lost_warned !=
+	     top->evlist->stats.nr_events[PERF_RECORD_LOST])) {
+		top->evlist->stats.nr_lost_warned =
+			      top->evlist->stats.nr_events[PERF_RECORD_LOST];
 		color_fprintf(stdout, PERF_COLOR_RED,
 			      "WARNING: LOST %d chunks, Check IO/CPU overload",
-			      hists->stats.nr_lost_warned);
+			      top->evlist->stats.nr_lost_warned);
 		++printed;
 	}
 
@@ -852,11 +852,9 @@ static void
 perf_top__process_lost(struct perf_top *top, union perf_event *event,
 		       struct evsel *evsel)
 {
-	struct hists *hists = evsel__hists(evsel);
-
 	top->lost += event->lost.lost;
 	top->lost_total += event->lost.lost;
-	hists->stats.total_lost += event->lost.lost;
+	evsel->evlist->stats.total_lost += event->lost.lost;
 }
 
 static void
@@ -864,11 +862,9 @@ perf_top__process_lost_samples(struct perf_top *top,
 			       union perf_event *event,
 			       struct evsel *evsel)
 {
-	struct hists *hists = evsel__hists(evsel);
-
 	top->lost += event->lost_samples.lost;
 	top->lost_total += event->lost_samples.lost;
-	hists->stats.total_lost_samples += event->lost_samples.lost;
+	evsel->evlist->stats.total_lost_samples += event->lost_samples.lost;
 }
 
 static u64 last_timestamp;
@@ -1205,7 +1201,7 @@ static int deliver_event(struct ordered_events *qe,
 	} else if (event->header.type == PERF_RECORD_LOST_SAMPLES) {
 		perf_top__process_lost_samples(top, event, evsel);
 	} else if (event->header.type < PERF_RECORD_MAX) {
-		hists__inc_nr_events(evsel__hists(evsel), event->header.type);
+		events_stats__inc(&session->evlist->stats, event->header.type);
 		machine__process_event(machine, event, &sample);
 	} else
 		++session->evlist->stats.nr_unknown_events;
@@ -1607,7 +1603,7 @@ int cmd_top(int argc, const char **argv)
 	if (status) {
 		/*
 		 * Some arches do not provide a get_cpuid(), so just use pr_debug, otherwise
-		 * warn the user explicitely.
+		 * warn the user explicitly.
 		 */
 		eprintf(status == ENOSYS ? 1 : 0, verbose,
 			"Couldn't read the cpuid for this machine: %s\n",
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 07857dfb4d91..dd8ff287e930 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -153,6 +153,7 @@ check lib/ctype.c		      '-I "^EXPORT_SYMBOL" -I "^#include <linux/export.h>" -B
 check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl
 check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl
 check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl
+check_2 tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl
 
 for i in $BEAUTY_FILES; do
   beauty_check $i -B
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index 825a12e8d694..4aa034aefa33 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -14,6 +14,7 @@ perf-config			mainporcelain common
 perf-evlist			mainporcelain common
 perf-ftrace			mainporcelain common
 perf-inject			mainporcelain common
+perf-iostat			mainporcelain common
 perf-kallsyms			mainporcelain common
 perf-kmem			mainporcelain common
 perf-kvm			mainporcelain common
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
index b80437971d80..a262dcd020f4 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -262,7 +262,7 @@ int sys_enter(struct syscall_enter_args *args)
 	/*
 	 * Jump to syscall specific augmenter, even if the default one,
 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
-	 * unagmented tracepoint payload.
+	 * unaugmented tracepoint payload.
 	 */
 	bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
 
@@ -282,7 +282,7 @@ int sys_exit(struct syscall_exit_args *args)
 	/*
 	 * Jump to syscall specific return augmenter, even if the default one,
 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
-	 * unagmented tracepoint payload.
+	 * unaugmented tracepoint payload.
 	 */
 	bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
 	/*
diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c
index 88108598d6e9..526dcaf9f079 100644
--- a/tools/perf/jvmti/jvmti_agent.c
+++ b/tools/perf/jvmti/jvmti_agent.c
@@ -390,7 +390,7 @@ jvmti_write_code(void *agent, char const *sym,
 		rec.p.total_size += size;
 
 	/*
-	 * If JVM is multi-threaded, nultiple concurrent calls to agent
+	 * If JVM is multi-threaded, multiple concurrent calls to agent
 	 * may be possible, so protect file writes
 	 */
 	flockfile(fp);
@@ -457,7 +457,7 @@ jvmti_write_debug_info(void *agent, uint64_t code,
 	rec.p.total_size = size;
 
 	/*
-	 * If JVM is multi-threaded, nultiple concurrent calls to agent
+	 * If JVM is multi-threaded, multiple concurrent calls to agent
 	 * may be possible, so protect file writes
 	 */
 	flockfile(fp);
diff --git a/tools/perf/perf-iostat.sh b/tools/perf/perf-iostat.sh
new file mode 100644
index 000000000000..e562f252d56f
--- /dev/null
+++ b/tools/perf/perf-iostat.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# perf iostat
+# Alexander Antonov <alexander.antonov@linux.intel.com>
+
+if [[ "$1" == "list" ]] || [[ "$1" =~ ([a-f0-9A-F]{1,}):([a-f0-9A-F]{1,2})(,)? ]]; then
+        DELIMITER="="
+else
+        DELIMITER=" "
+fi
+
+perf stat --iostat$DELIMITER$*
diff --git a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
index 75376c7cc072..913fb200ea52 100644
--- a/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
+++ b/tools/perf/pmu-events/arch/arm64/armv8-common-and-microarch.json
@@ -210,12 +210,24 @@
         "BriefDescription": "Attributable Level 2 data TLB refill"
     },
     {
+        "PublicDescription": "Attributable Level 2 instruction TLB refill.",
+        "EventCode": "0x2E",
+        "EventName": "L2I_TLB_REFILL",
+        "BriefDescription": "Attributable Level 2 instruction TLB refill."
+    },
+    {
         "PublicDescription": "Attributable Level 2 data or unified TLB access",
         "EventCode": "0x2F",
         "EventName": "L2D_TLB",
         "BriefDescription": "Attributable Level 2 data or unified TLB access"
     },
     {
+        "PublicDescription": "Attributable Level 2 instruction TLB access.",
+        "EventCode": "0x30",
+        "EventName": "L2I_TLB",
+        "BriefDescription": "Attributable Level 2 instruction TLB access."
+    },
+    {
         "PublicDescription": "Access to another socket in a multi-socket system",
         "EventCode": "0x31",
         "EventName": "REMOTE_ACCESS",
@@ -244,5 +256,221 @@
         "EventCode": "0x37",
         "EventName": "LL_CACHE_MISS_RD",
         "BriefDescription": "Last level cache miss, read"
+    },
+    {
+        "PublicDescription": "SIMD Instruction architecturally executed.",
+        "EventCode": "0x8000",
+        "EventName": "SIMD_INST_RETIRED",
+        "BriefDescription": "SIMD Instruction architecturally executed."
+    },
+    {
+        "PublicDescription": "Instruction architecturally executed, SVE.",
+        "EventCode": "0x8002",
+        "EventName": "SVE_INST_RETIRED",
+        "BriefDescription": "Instruction architecturally executed, SVE."
+    },
+    {
+        "PublicDescription": "Microarchitectural operation, Operations speculatively executed.",
+        "EventCode": "0x8008",
+        "EventName": "UOP_SPEC",
+        "BriefDescription": "Microarchitectural operation, Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE Math accelerator Operations speculatively executed.",
+        "EventCode": "0x800E",
+        "EventName": "SVE_MATH_SPEC",
+        "BriefDescription": "SVE Math accelerator Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point Operations speculatively executed.",
+        "EventCode": "0x8010",
+        "EventName": "FP_SPEC",
+        "BriefDescription": "Floating-point Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point FMA Operations speculatively executed.",
+        "EventCode": "0x8028",
+        "EventName": "FP_FMA_SPEC",
+        "BriefDescription": "Floating-point FMA Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Floating-point reciprocal estimate Operations speculatively executed.",
+        "EventCode": "0x8034",
+        "EventName": "FP_RECPE_SPEC",
+        "BriefDescription": "Floating-point reciprocal estimate Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "floating-point convert Operations speculatively executed.",
+        "EventCode": "0x8038",
+        "EventName": "FP_CVT_SPEC",
+        "BriefDescription": "floating-point convert Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE integer Operations speculatively executed.",
+        "EventCode": "0x8043",
+        "EventName": "ASE_SVE_INT_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE integer Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE predicated Operations speculatively executed.",
+        "EventCode": "0x8074",
+        "EventName": "SVE_PRED_SPEC",
+        "BriefDescription": "SVE predicated Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX Operations speculatively executed.",
+        "EventCode": "0x807C",
+        "EventName": "SVE_MOVPRFX_SPEC",
+        "BriefDescription": "SVE MOVPRFX Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE MOVPRFX unfused Operations speculatively executed.",
+        "EventCode": "0x807F",
+        "EventName": "SVE_MOVPRFX_U_SPEC",
+        "BriefDescription": "SVE MOVPRFX unfused Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE load Operations speculatively executed.",
+        "EventCode": "0x8085",
+        "EventName": "ASE_SVE_LD_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE store Operations speculatively executed.",
+        "EventCode": "0x8086",
+        "EventName": "ASE_SVE_ST_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Prefetch Operations speculatively executed.",
+        "EventCode": "0x8087",
+        "EventName": "PRF_SPEC",
+        "BriefDescription": "Prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register load Operations speculatively executed.",
+        "EventCode": "0x8089",
+        "EventName": "BASE_LD_REG_SPEC",
+        "BriefDescription": "General-purpose register load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "General-purpose register store Operations speculatively executed.",
+        "EventCode": "0x808A",
+        "EventName": "BASE_ST_REG_SPEC",
+        "BriefDescription": "General-purpose register store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated load register Operations speculatively executed.",
+        "EventCode": "0x8091",
+        "EventName": "SVE_LDR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated load register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE unpredicated store register Operations speculatively executed.",
+        "EventCode": "0x8092",
+        "EventName": "SVE_STR_REG_SPEC",
+        "BriefDescription": "SVE unpredicated store register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE load predicate register Operations speculatively executed.",
+        "EventCode": "0x8095",
+        "EventName": "SVE_LDR_PREG_SPEC",
+        "BriefDescription": "SVE load predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE store predicate register Operations speculatively executed.",
+        "EventCode": "0x8096",
+        "EventName": "SVE_STR_PREG_SPEC",
+        "BriefDescription": "SVE store predicate register Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE contiguous prefetch element Operations speculatively executed.",
+        "EventCode": "0x809F",
+        "EventName": "SVE_PRF_CONTIG_SPEC",
+        "BriefDescription": "SVE contiguous prefetch element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A5",
+        "EventName": "ASE_SVE_LD_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous load multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed.",
+        "EventCode": "0x80A6",
+        "EventName": "ASE_SVE_ST_MULTI_SPEC",
+        "BriefDescription": "Advanced SIMD and SVE contiguous store multiple vector Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-load Operations speculatively executed.",
+        "EventCode": "0x80AD",
+        "EventName": "SVE_LD_GATHER_SPEC",
+        "BriefDescription": "SVE gather-load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE scatter-store Operations speculatively executed.",
+        "EventCode": "0x80AE",
+        "EventName": "SVE_ST_SCATTER_SPEC",
+        "BriefDescription": "SVE scatter-store Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE gather-prefetch Operations speculatively executed.",
+        "EventCode": "0x80AF",
+        "EventName": "SVE_PRF_GATHER_SPEC",
+        "BriefDescription": "SVE gather-prefetch Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "SVE First-fault load Operations speculatively executed.",
+        "EventCode": "0x80BC",
+        "EventName": "SVE_LDFF_SPEC",
+        "BriefDescription": "SVE First-fault load Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C0",
+        "EventName": "FP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C1",
+        "EventName": "FP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C2",
+        "EventName": "FP_HP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable half-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C3",
+        "EventName": "FP_HP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable half-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C4",
+        "EventName": "FP_SP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable single-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C5",
+        "EventName": "FP_SP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable single-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C6",
+        "EventName": "FP_DP_SCALE_OPS_SPEC",
+        "BriefDescription": "Scalable double-precision floating-point element Operations speculatively executed."
+    },
+    {
+        "PublicDescription": "Non-scalable double-precision floating-point element Operations speculatively executed.",
+        "EventCode": "0x80C7",
+        "EventName": "FP_DP_FIXED_OPS_SPEC",
+        "BriefDescription": "Non-scalable double-precision floating-point element Operations speculatively executed."
     }
 ]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
new file mode 100644
index 000000000000..b011af11bf94
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
@@ -0,0 +1,8 @@
+[
+  {
+    "ArchStdEvent": "BR_MIS_PRED"
+  },
+  {
+    "ArchStdEvent": "BR_PRED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
new file mode 100644
index 000000000000..084e88d7df73
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
@@ -0,0 +1,62 @@
+[
+  {
+    "PublicDescription": "This event counts read transactions from tofu controller to measured CMG.",
+    "EventCode": "0x314",
+    "EventName": "BUS_READ_TOTAL_TOFU",
+    "BriefDescription": "This event counts read transactions from tofu controller to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts read transactions from PCI controller to measured CMG.",
+    "EventCode": "0x315",
+    "EventName": "BUS_READ_TOTAL_PCI",
+    "BriefDescription": "This event counts read transactions from PCI controller to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts read transactions from measured CMG local memory to measured CMG.",
+    "EventCode": "0x316",
+    "EventName": "BUS_READ_TOTAL_MEM",
+    "BriefDescription": "This event counts read transactions from measured CMG local memory to measured CMG."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0.",
+    "EventCode": "0x318",
+    "EventName": "BUS_WRITE_TOTAL_CMG0",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG0, if measured CMG is not CMG0."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1.",
+    "EventCode": "0x319",
+    "EventName": "BUS_WRITE_TOTAL_CMG1",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG1, if measured CMG is not CMG1."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2.",
+    "EventCode": "0x31A",
+    "EventName": "BUS_WRITE_TOTAL_CMG2",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG2, if measured CMG is not CMG2."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3.",
+    "EventCode": "0x31B",
+    "EventName": "BUS_WRITE_TOTAL_CMG3",
+    "BriefDescription": "This event counts write transactions from measured CMG to CMG3, if measured CMG is not CMG3."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to tofu controller.",
+    "EventCode": "0x31C",
+    "EventName": "BUS_WRITE_TOTAL_TOFU",
+    "BriefDescription": "This event counts write transactions from measured CMG to tofu controller."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to PCI controller.",
+    "EventCode": "0x31D",
+    "EventName": "BUS_WRITE_TOTAL_PCI",
+    "BriefDescription": "This event counts write transactions from measured CMG to PCI controller."
+  },
+  {
+    "PublicDescription": "This event counts write transactions from measured CMG to measured CMG local memory.",
+    "EventCode": "0x31E",
+    "EventName": "BUS_WRITE_TOTAL_MEM",
+    "BriefDescription": "This event counts write transactions from measured CMG to measured CMG local memory."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json
new file mode 100644
index 000000000000..2e341a951a10
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json
@@ -0,0 +1,128 @@
+[
+  {
+    "ArchStdEvent": "L1I_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1I_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE"
+  },
+  {
+    "ArchStdEvent": "L1D_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L1I_CACHE"
+  },
+  {
+    "ArchStdEvent": "L1D_CACHE_WB"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2D_CACHE_WB"
+  },
+  {
+    "ArchStdEvent": "L2D_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2I_TLB_REFILL"
+  },
+  {
+    "ArchStdEvent": "L2D_TLB"
+  },
+  {
+    "ArchStdEvent": "L2I_TLB"
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch.",
+    "EventCode": "0x49",
+    "EventName": "L1D_CACHE_REFILL_PRF",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch.",
+    "EventCode": "0x59",
+    "EventName": "L2D_CACHE_REFILL_PRF",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by demand access.",
+    "EventCode": "0x200",
+    "EventName": "L1D_CACHE_REFILL_DM",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by demand access."
+  },
+  {
+    "PublicDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch.",
+    "EventCode": "0x202",
+    "EventName": "L1D_CACHE_REFILL_HWPRF",
+    "BriefDescription": "This event counts L1D_CACHE_REFILL caused by hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L1D cache miss requests per cycle.",
+    "EventCode": "0x208",
+    "EventName": "L1_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L1D cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L1I cache miss requests per cycle.",
+    "EventCode": "0x209",
+    "EventName": "L1I_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L1I cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by demand access.",
+    "EventCode": "0x300",
+    "EventName": "L2D_CACHE_REFILL_DM",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by demand access."
+  },
+  {
+    "PublicDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch.",
+    "EventCode": "0x302",
+    "EventName": "L2D_CACHE_REFILL_HWPRF",
+    "BriefDescription": "This event counts L2D_CACHE_REFILL caused by hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts outstanding L2 cache miss requests per cycle.",
+    "EventCode": "0x308",
+    "EventName": "L2_MISS_WAIT",
+    "BriefDescription": "This event counts outstanding L2 cache miss requests per cycle."
+  },
+  {
+    "PublicDescription": "This event counts the number of times of L2 cache miss.",
+    "EventCode": "0x309",
+    "EventName": "L2_MISS_COUNT",
+    "BriefDescription": "This event counts the number of times of L2 cache miss."
+  },
+  {
+    "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.",
+    "EventCode": "0x325",
+    "EventName": "L2D_SWAP_DM",
+    "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.",
+    "EventCode": "0x326",
+    "EventName": "L2D_CACHE_MIBMCH_PRF",
+    "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access."
+  },
+  {
+    "PublicDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch.",
+    "EventCode": "0x396",
+    "EventName": "L2D_CACHE_SWAP_LOCAL",
+    "BriefDescription": "This event counts operations where demand access hits an L2 cache refill buffer allocated by software or hardware prefetch."
+  },
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of L2 cache.",
+    "EventCode": "0x3E0",
+    "EventName": "EA_L2",
+    "BriefDescription": "This event counts energy consumption per cycle of L2 cache."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json
new file mode 100644
index 000000000000..b16484628290
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json
@@ -0,0 +1,5 @@
+[
+  {
+    "ArchStdEvent": "CPU_CYCLES"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json
new file mode 100644
index 000000000000..348749c154c0
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json
@@ -0,0 +1,29 @@
+[
+  {
+    "ArchStdEvent": "EXC_TAKEN"
+  },
+  {
+    "ArchStdEvent": "EXC_UNDEF"
+  },
+  {
+    "ArchStdEvent": "EXC_SVC"
+  },
+  {
+    "ArchStdEvent": "EXC_PABORT"
+  },
+  {
+    "ArchStdEvent": "EXC_DABORT"
+  },
+  {
+    "ArchStdEvent": "EXC_IRQ"
+  },
+  {
+    "ArchStdEvent": "EXC_FIQ"
+  },
+  {
+    "ArchStdEvent": "EXC_SMC"
+  },
+  {
+    "ArchStdEvent": "EXC_HVC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json
new file mode 100644
index 000000000000..6d258b1080cf
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json
@@ -0,0 +1,131 @@
+[
+  {
+    "ArchStdEvent": "SW_INCR"
+  },
+  {
+    "ArchStdEvent": "INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "EXC_RETURN"
+  },
+  {
+    "ArchStdEvent": "CID_WRITE_RETIRED"
+  },
+  {
+    "ArchStdEvent": "INST_SPEC"
+  },
+  {
+    "ArchStdEvent": "LDREX_SPEC"
+  },
+  {
+    "ArchStdEvent": "STREX_SPEC"
+  },
+  {
+    "ArchStdEvent": "LD_SPEC"
+  },
+  {
+    "ArchStdEvent": "ST_SPEC"
+  },
+  {
+    "ArchStdEvent": "LDST_SPEC"
+  },
+  {
+    "ArchStdEvent": "DP_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SPEC"
+  },
+  {
+    "ArchStdEvent": "VFP_SPEC"
+  },
+  {
+    "ArchStdEvent": "PC_WRITE_SPEC"
+  },
+  {
+    "ArchStdEvent": "CRYPTO_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_IMMED_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_RETURN_SPEC"
+  },
+  {
+    "ArchStdEvent": "BR_INDIRECT_SPEC"
+  },
+  {
+    "ArchStdEvent": "ISB_SPEC"
+  },
+  {
+    "ArchStdEvent": "DSB_SPEC"
+  },
+  {
+    "ArchStdEvent": "DMB_SPEC"
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction.",
+    "EventCode": "0x9F",
+    "EventName": "DCZVA_SPEC",
+    "BriefDescription": "This event counts architecturally executed zero blocking operations due to the 'DC ZVA' instruction."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed floating-point move operations.",
+    "EventCode": "0x105",
+    "EventName": "FP_MV_SPEC",
+    "BriefDescription": "This event counts architecturally executed floating-point move operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that using predicate register.",
+    "EventCode": "0x108",
+    "EventName": "PRD_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that using predicate register."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed inter-element manipulation operations.",
+    "EventCode": "0x109",
+    "EventName": "IEL_SPEC",
+    "BriefDescription": "This event counts architecturally executed inter-element manipulation operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed inter-register manipulation operations.",
+    "EventCode": "0x10A",
+    "EventName": "IREG_SPEC",
+    "BriefDescription": "This event counts architecturally executed inter-register manipulation operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed NOSIMD load operations that using SIMD&FP registers.",
+    "EventCode": "0x112",
+    "EventName": "FP_LD_SPEC",
+    "BriefDescription": "This event counts architecturally executed NOSIMD load operations that using SIMD&FP registers."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed NOSIMD store operations that using SIMD&FP registers.",
+    "EventCode": "0x113",
+    "EventName": "FP_ST_SPEC",
+    "BriefDescription": "This event counts architecturally executed NOSIMD store operations that using SIMD&FP registers."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations.",
+    "EventCode": "0x11A",
+    "EventName": "BC_LD_SPEC",
+    "BriefDescription": "This event counts architecturally executed SIMD broadcast floating-point load operations."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction.",
+    "EventCode": "0x121",
+    "EventName": "EFFECTIVE_INST_SPEC",
+    "BriefDescription": "This event counts architecturally executed instructions, excluding the MOVPRFX instruction."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that uses 'pre-index' as its addressing mode.",
+    "EventCode": "0x123",
+    "EventName": "PRE_INDEX_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that uses 'pre-index' as its addressing mode."
+  },
+  {
+    "PublicDescription": "This event counts architecturally executed operations that uses 'post-index' as its addressing mode.",
+    "EventCode": "0x124",
+    "EventName": "POST_INDEX_SPEC",
+    "BriefDescription": "This event counts architecturally executed operations that uses 'post-index' as its addressing mode."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json
new file mode 100644
index 000000000000..c1f6479e92b4
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json
@@ -0,0 +1,8 @@
+[
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of CMG local memory.",
+    "EventCode": "0x3E8",
+    "EventName": "EA_MEMORY",
+    "BriefDescription": "This event counts energy consumption per cycle of CMG local memory."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json
new file mode 100644
index 000000000000..10c823ac26cc
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json
@@ -0,0 +1,188 @@
+[
+  {
+    "PublicDescription": "This event counts the occurrence count of the micro-operation split.",
+    "EventCode": "0x139",
+    "EventName": "UOP_SPLIT",
+    "BriefDescription": "This event counts the occurrence count of the micro-operation split."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access.",
+    "EventCode": "0x180",
+    "EventName": "LD_COMP_WAIT_L2_MISS",
+    "BriefDescription": "This event counts every cycle that no operation was committed because the oldest and uncommitted load/store/prefetch operation waits for memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access.",
+    "EventCode": "0x181",
+    "EventName": "LD_COMP_WAIT_L2_MISS_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access.",
+    "EventCode": "0x182",
+    "EventName": "LD_COMP_WAIT_L1_MISS",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L2 cache access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access.",
+    "EventCode": "0x183",
+    "EventName": "LD_COMP_WAIT_L1_MISS_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L2 cache access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access.",
+    "EventCode": "0x184",
+    "EventName": "LD_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted load/store/prefetch operation waits for L1D cache, L2 cache and memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access.",
+    "EventCode": "0x185",
+    "EventName": "LD_COMP_WAIT_EX",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the oldest and uncommitted integer load operation waits for L1D cache, L2 cache and memory access."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port.",
+    "EventCode": "0x186",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed due to the lack of an available prefetch port."
+  },
+  {
+    "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation.",
+    "EventCode": "0x187",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY_EX",
+    "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by an integer load operation."
+  },
+  {
+    "PublicDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction.",
+    "EventCode": "0x188",
+    "EventName": "LD_COMP_WAIT_PFP_BUSY_SWPF",
+    "BriefDescription": "This event counts the LD_COMP_WAIT_PFP_BUSY caused by a software prefetch instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction.",
+    "EventCode": "0x189",
+    "EventName": "EU_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is an integer or floating-point/SIMD instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction.",
+    "EventCode": "0x18A",
+    "EventName": "FL_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a floating-point/SIMD instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction.",
+    "EventCode": "0x18B",
+    "EventName": "BR_COMP_WAIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed and the oldest and uncommitted instruction is a branch instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty.",
+    "EventCode": "0x18C",
+    "EventName": "ROB_EMPTY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full.",
+    "EventCode": "0x18D",
+    "EventName": "ROB_EMPTY_STQ_BUSY",
+    "BriefDescription": "This event counts every cycle that no instruction was committed because the CSE is empty and the store port (SP) is full."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction.",
+    "EventCode": "0x18E",
+    "EventName": "WFE_WFI_CYCLE",
+    "BriefDescription": "This event counts every cycle that the instruction unit is halted by the WFE/WFI instruction."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only.",
+    "EventCode": "0x190",
+    "EventName": "_0INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that no instruction was committed, but counts at the time when commits MOVPRFX only."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that one instruction is committed.",
+    "EventCode": "0x191",
+    "EventName": "_1INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that one instruction is committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that two instructions are committed.",
+    "EventCode": "0x192",
+    "EventName": "_2INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that two instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that three instructions are committed.",
+    "EventCode": "0x193",
+    "EventName": "_3INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that three instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that four instructions are committed.",
+    "EventCode": "0x194",
+    "EventName": "_4INST_COMMIT",
+    "BriefDescription": "This event counts every cycle that four instructions are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that only any micro-operations are committed.",
+    "EventCode": "0x198",
+    "EventName": "UOP_ONLY_COMMIT",
+    "BriefDescription": "This event counts every cycle that only any micro-operations are committed."
+  },
+  {
+    "PublicDescription": "This event counts every cycle that only the MOVPRFX instruction is committed.",
+    "EventCode": "0x199",
+    "EventName": "SINGLE_MOVPRFX_COMMIT",
+    "BriefDescription": "This event counts every cycle that only the MOVPRFX instruction is committed."
+  },
+  {
+    "PublicDescription": "This event counts energy consumption per cycle of core.",
+    "EventCode": "0x1E0",
+    "EventName": "EA_CORE",
+    "BriefDescription": "This event counts energy consumption per cycle of core."
+  },
+  {
+    "PublicDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x230",
+    "EventName": "L1HWPF_STREAM_PF",
+    "BriefDescription": "This event counts streaming prefetch requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x231",
+    "EventName": "L1HWPF_INJ_ALLOC_PF",
+    "BriefDescription": "This event counts allocation type prefetch injection requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher.",
+    "EventCode": "0x232",
+    "EventName": "L1HWPF_INJ_NOALLOC_PF",
+    "BriefDescription": "This event counts non-allocation type prefetch injection requests to L1D cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefecher.",
+    "EventCode": "0x233",
+    "EventName": "L2HWPF_STREAM_PF",
+    "BriefDescription": "This event counts streaming prefetch requests to L2 cache generated by hardware prefecher."
+  },
+  {
+    "PublicDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.",
+    "EventCode": "0x234",
+    "EventName": "L2HWPF_INJ_ALLOC_PF",
+    "BriefDescription": "This event counts allocation type prefetch injection requests to L2 cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher.",
+    "EventCode": "0x235",
+    "EventName": "L2HWPF_INJ_NOALLOC_PF",
+    "BriefDescription": "This event counts non-allocation type prefetch injection requests to L2 cache generated by hardware prefetcher."
+  },
+  {
+    "PublicDescription": "This event counts prefetch requests to L2 cache generated by the other causes.",
+    "EventCode": "0x236",
+    "EventName": "L2HWPF_OTHER",
+    "BriefDescription": "This event counts prefetch requests to L2 cache generated by the other causes."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json
new file mode 100644
index 000000000000..dd7c97a9972b
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json
@@ -0,0 +1,194 @@
+[
+  {
+    "ArchStdEvent": "STALL_FRONTEND"
+  },
+  {
+    "ArchStdEvent": "STALL_BACKEND"
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EAGA pipeline.",
+    "EventCode": "0x1A0",
+    "EventName": "EAGA_VAL",
+    "BriefDescription": "This event counts valid cycles of EAGA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EAGB pipeline.",
+    "EventCode": "0x1A1",
+    "EventName": "EAGB_VAL",
+    "BriefDescription": "This event counts valid cycles of EAGB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EXA pipeline.",
+    "EventCode": "0x1A2",
+    "EventName": "EXA_VAL",
+    "BriefDescription": "This event counts valid cycles of EXA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of EXB pipeline.",
+    "EventCode": "0x1A3",
+    "EventName": "EXB_VAL",
+    "BriefDescription": "This event counts valid cycles of EXB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of FLA pipeline.",
+    "EventCode": "0x1A4",
+    "EventName": "FLA_VAL",
+    "BriefDescription": "This event counts valid cycles of FLA pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of FLB pipeline.",
+    "EventCode": "0x1A5",
+    "EventName": "FLB_VAL",
+    "BriefDescription": "This event counts valid cycles of FLB pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of PRX pipeline.",
+    "EventCode": "0x1A6",
+    "EventName": "PRX_VAL",
+    "BriefDescription": "This event counts valid cycles of PRX pipeline."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x1B4",
+    "EventName": "FLA_VAL_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLA pipeline, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x1B5",
+    "EventName": "FLB_VAL_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in FLB pipeline, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1D cache pipeline#0.",
+    "EventCode": "0x240",
+    "EventName": "L1_PIPE0_VAL",
+    "BriefDescription": "This event counts valid cycles of L1D cache pipeline#0."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1D cache pipeline#1.",
+    "EventCode": "0x241",
+    "EventName": "L1_PIPE1_VAL",
+    "BriefDescription": "This event counts valid cycles of L1D cache pipeline#1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1.",
+    "EventCode": "0x250",
+    "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_SCE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sce bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1.",
+    "EventCode": "0x251",
+    "EventName": "L1_PIPE0_VAL_IU_TAG_ADRS_PFE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its pfe bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1.",
+    "EventCode": "0x252",
+    "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_SCE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sce bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1.",
+    "EventCode": "0x253",
+    "EventName": "L1_PIPE1_VAL_IU_TAG_ADRS_PFE",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its pfe bit of tagged address is 1."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1D cache pipeline#0.",
+    "EventCode": "0x260",
+    "EventName": "L1_PIPE0_COMP",
+    "BriefDescription": "This event counts completed requests in L1D cache pipeline#0."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1D cache pipeline#1.",
+    "EventCode": "0x261",
+    "EventName": "L1_PIPE1_COMP",
+    "BriefDescription": "This event counts completed requests in L1D cache pipeline#1."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L1I cache pipeline.",
+    "EventCode": "0x268",
+    "EventName": "L1I_PIPE_COMP",
+    "BriefDescription": "This event counts completed requests in L1I cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L1I cache pipeline.",
+    "EventCode": "0x269",
+    "EventName": "L1I_PIPE_VAL",
+    "BriefDescription": "This event counts valid cycles of L1I cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts aborted requests in L1D pipelines that due to store-load interlock.",
+    "EventCode": "0x274",
+    "EventName": "L1_PIPE_ABORT_STLD_INTLK",
+    "BriefDescription": "This event counts aborted requests in L1D pipelines that due to store-load interlock."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0.",
+    "EventCode": "0x2A0",
+    "EventName": "L1_PIPE0_VAL_IU_NOT_SEC0",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#0 that its sector cache ID is not 0."
+  },
+  {
+    "PublicDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0.",
+    "EventCode": "0x2A1",
+    "EventName": "L1_PIPE1_VAL_IU_NOT_SEC0",
+    "BriefDescription": "This event counts requests in L1D cache pipeline#1 that its sector cache ID is not 0."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined.",
+    "EventCode": "0x2B0",
+    "EventName": "L1_PIPE_COMP_GATHER_2FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 2 flows because 2 elements could not be combined."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined.",
+    "EventCode": "0x2B1",
+    "EventName": "L1_PIPE_COMP_GATHER_1FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 1 flow because 2 elements could be combined."
+  },
+  {
+    "PublicDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0.",
+    "EventCode": "0x2B2",
+    "EventName": "L1_PIPE_COMP_GATHER_0FLOW",
+    "BriefDescription": "This event counts the number of times where 2 elements of the gather instructions became 0 flow because both predicate values are 0."
+  },
+  {
+    "PublicDescription": "This event counts the number of flows of the scatter instructions.",
+    "EventCode": "0x2B3",
+    "EventName": "L1_PIPE_COMP_SCATTER_1FLOW",
+    "BriefDescription": "This event counts the number of flows of the scatter instructions."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x2B8",
+    "EventName": "L1_PIPE0_COMP_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#0, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1.",
+    "EventCode": "0x2B9",
+    "EventName": "L1_PIPE1_COMP_PRD_CNT",
+    "BriefDescription": "This event counts the number of 1's in the predicate bits of request in L1D cache pipeline#1, where it is corrected so that it becomes 16 when all bits are 1."
+  },
+  {
+    "PublicDescription": "This event counts valid cycles of L2 cache pipeline.",
+    "EventCode": "0x330",
+    "EventName": "L2_PIPE_VAL",
+    "BriefDescription": "This event counts valid cycles of L2 cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts completed requests in L2 cache pipeline.",
+    "EventCode": "0x350",
+    "EventName": "L2_PIPE_COMP_ALL",
+    "BriefDescription": "This event counts completed requests in L2 cache pipeline."
+  },
+  {
+    "PublicDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access.",
+    "EventCode": "0x370",
+    "EventName": "L2_PIPE_COMP_PF_L2MIB_MCH",
+    "BriefDescription": "This event counts operations where software or hardware prefetch hits an L2 cache refill buffer allocated by demand access."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json
new file mode 100644
index 000000000000..dc1b95e42c32
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json
@@ -0,0 +1,110 @@
+[
+  {
+    "ArchStdEvent": "SIMD_INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "SVE_INST_RETIRED"
+  },
+  {
+    "ArchStdEvent": "UOP_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MATH_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_FMA_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_RECPE_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_CVT_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_INT_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRED_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MOVPRFX_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_MOVPRFX_U_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_LD_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_ST_SPEC"
+  },
+  {
+    "ArchStdEvent": "PRF_SPEC"
+  },
+  {
+    "ArchStdEvent": "BASE_LD_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "BASE_ST_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDR_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_STR_REG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDR_PREG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_STR_PREG_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRF_CONTIG_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_LD_MULTI_SPEC"
+  },
+  {
+    "ArchStdEvent": "ASE_SVE_ST_MULTI_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LD_GATHER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_ST_SCATTER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_PRF_GATHER_SPEC"
+  },
+  {
+    "ArchStdEvent": "SVE_LDFF_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_HP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_HP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_SP_FIXED_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_DP_SCALE_OPS_SPEC"
+  },
+  {
+    "ArchStdEvent": "FP_DP_FIXED_OPS_SPEC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
new file mode 100644
index 000000000000..dda8e59149d2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
@@ -0,0 +1,233 @@
+[
+    {
+        "MetricExpr": "FETCH_BUBBLE / (4 * CPU_CYCLES)",
+        "PublicDescription": "Frontend bound L1 topdown metric",
+        "BriefDescription": "Frontend bound L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "frontend_bound"
+    },
+    {
+        "MetricExpr": "(INST_SPEC - INST_RETIRED) / (4 * CPU_CYCLES)",
+        "PublicDescription": "Bad Speculation L1 topdown metric",
+        "BriefDescription": "Bad Speculation L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "bad_speculation"
+    },
+    {
+        "MetricExpr": "INST_RETIRED / (CPU_CYCLES * 4)",
+        "PublicDescription": "Retiring L1 topdown metric",
+        "BriefDescription": "Retiring L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "retiring"
+    },
+    {
+        "MetricExpr": "1 - (frontend_bound + bad_speculation + retiring)",
+        "PublicDescription": "Backend Bound L1 topdown metric",
+        "BriefDescription": "Backend Bound L1 topdown metric",
+        "MetricGroup": "TopDownL1",
+        "MetricName": "backend_bound"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x201d@ / CPU_CYCLES",
+        "PublicDescription": "Fetch latency bound L2 topdown metric",
+        "BriefDescription": "Fetch latency bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "fetch_latency_bound"
+    },
+    {
+        "MetricExpr": "frontend_bound - fetch_latency_bound",
+        "PublicDescription": "Fetch bandwidth bound L2 topdown metric",
+        "BriefDescription": "Fetch bandwidth bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "fetch_bandwidth_bound"
+    },
+    {
+        "MetricExpr": "(bad_speculation * BR_MIS_PRED) / (BR_MIS_PRED + armv8_pmuv3_0@event\\=0x2013@)",
+        "PublicDescription": "Branch mispredicts L2 topdown metric",
+        "BriefDescription": "Branch mispredicts L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "branch_mispredicts"
+    },
+    {
+        "MetricExpr": "bad_speculation - branch_mispredicts",
+        "PublicDescription": "Machine clears L2 topdown metric",
+        "BriefDescription": "Machine clears L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "machine_clears"
+    },
+    {
+        "MetricExpr": "(EXE_STALL_CYCLE - (MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@)) / CPU_CYCLES",
+        "PublicDescription": "Core bound L2 topdown metric",
+        "BriefDescription": "Core bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "core_bound"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_ANYLOAD + armv8_pmuv3_0@event\\=0x7005@) / CPU_CYCLES",
+        "PublicDescription": "Memory bound L2 topdown metric",
+        "BriefDescription": "Memory bound L2 topdown metric",
+        "MetricGroup": "TopDownL2",
+        "MetricName": "memory_bound"
+    },
+    {
+        "MetricExpr": "(((L2I_TLB - L2I_TLB_REFILL) * 15) + (L2I_TLB_REFILL * 100)) / CPU_CYCLES",
+        "PublicDescription": "Idle by itlb miss L3 topdown metric",
+        "BriefDescription": "Idle by itlb miss L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "idle_by_itlb_miss"
+    },
+    {
+        "MetricExpr": "(((L2I_CACHE - L2I_CACHE_REFILL) * 15) + (L2I_CACHE_REFILL * 100)) / CPU_CYCLES",
+        "PublicDescription": "Idle by icache miss L3 topdown metric",
+        "BriefDescription": "Idle by icache miss L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "idle_by_icache_miss"
+    },
+    {
+        "MetricExpr": "(BR_MIS_PRED * 5) / CPU_CYCLES",
+        "PublicDescription": "BP misp flush L3 topdown metric",
+        "BriefDescription": "BP misp flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "bp_misp_flush"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x2013@ * 5) / CPU_CYCLES",
+        "PublicDescription": "OOO flush L3 topdown metric",
+        "BriefDescription": "OOO flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "ooo_flush"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x1001@ * 5) / CPU_CYCLES",
+        "PublicDescription": "Static predictor flush L3 topdown metric",
+        "BriefDescription": "Static predictor flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "sp_flush"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x1010@ / BR_MIS_PRED",
+        "PublicDescription": "Indirect branch L3 topdown metric",
+        "BriefDescription": "Indirect branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "indirect_branch"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x1014@ + armv8_pmuv3_0@event\\=0x1018@) / BR_MIS_PRED",
+        "PublicDescription": "Push branch L3 topdown metric",
+        "BriefDescription": "Push branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "push_branch"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x100c@ / BR_MIS_PRED",
+        "PublicDescription": "Pop branch L3 topdown metric",
+        "BriefDescription": "Pop branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "pop_branch"
+    },
+    {
+        "MetricExpr": "(BR_MIS_PRED - armv8_pmuv3_0@event\\=0x1010@ - armv8_pmuv3_0@event\\=0x1014@ - armv8_pmuv3_0@event\\=0x1018@ - armv8_pmuv3_0@event\\=0x100c@) / BR_MIS_PRED",
+        "PublicDescription": "Other branch L3 topdown metric",
+        "BriefDescription": "Other branch L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "other_branch"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2012@ / armv8_pmuv3_0@event\\=0x2013@",
+        "PublicDescription": "Nuke flush L3 topdown metric",
+        "BriefDescription": "Nuke flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "nuke_flush"
+    },
+    {
+        "MetricExpr": "1 - nuke_flush",
+        "PublicDescription": "Other flush L3 topdown metric",
+        "BriefDescription": "Other flush L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "other_flush"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2010@ / CPU_CYCLES",
+        "PublicDescription": "Sync stall L3 topdown metric",
+        "BriefDescription": "Sync stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "sync_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2004@ / CPU_CYCLES",
+        "PublicDescription": "Rob stall L3 topdown metric",
+        "BriefDescription": "Rob stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "rob_stall"
+    },
+    {
+        "MetricExpr": "(armv8_pmuv3_0@event\\=0x2006@ + armv8_pmuv3_0@event\\=0x2007@ + armv8_pmuv3_0@event\\=0x2008@) / CPU_CYCLES",
+        "PublicDescription": "Ptag stall L3 topdown metric",
+        "BriefDescription": "Ptag stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "ptag_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x201e@ / CPU_CYCLES",
+        "PublicDescription": "SaveOpQ stall L3 topdown metric",
+        "BriefDescription": "SaveOpQ stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "saveopq_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x2005@ / CPU_CYCLES",
+        "PublicDescription": "PC buffer stall L3 topdown metric",
+        "BriefDescription": "PC buffer stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "pc_buffer_stall"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7002@ / CPU_CYCLES",
+        "PublicDescription": "Divider L3 topdown metric",
+        "BriefDescription": "Divider L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "divider"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7003@ / CPU_CYCLES",
+        "PublicDescription": "FSU stall L3 topdown metric",
+        "BriefDescription": "FSU stall L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "fsu_stall"
+    },
+    {
+        "MetricExpr": "core_bound - divider - fsu_stall",
+        "PublicDescription": "EXE ports util L3 topdown metric",
+        "BriefDescription": "EXE ports util L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "exe_ports_util"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_ANYLOAD - MEM_STALL_L1MISS) / CPU_CYCLES",
+        "PublicDescription": "L1 bound L3 topdown metric",
+        "BriefDescription": "L1 bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "l1_bound"
+    },
+    {
+        "MetricExpr": "(MEM_STALL_L1MISS - MEM_STALL_L2MISS) / CPU_CYCLES",
+        "PublicDescription": "L2 bound L3 topdown metric",
+        "BriefDescription": "L2 bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "l2_bound"
+    },
+    {
+        "MetricExpr": "MEM_STALL_L2MISS / CPU_CYCLES",
+        "PublicDescription": "Mem bound L3 topdown metric",
+        "BriefDescription": "Mem bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "mem_bound"
+    },
+    {
+        "MetricExpr": "armv8_pmuv3_0@event\\=0x7005@ / CPU_CYCLES",
+        "PublicDescription": "Store bound L3 topdown metric",
+        "BriefDescription": "Store bound L3 topdown metric",
+        "MetricGroup": "TopDownL3",
+        "MetricName": "store_bound"
+    },
+]
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index 0d609149b82a..c43591d831b8 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -20,5 +20,6 @@
 0x00000000410fd0c0,v1,arm/cortex-a76-n1,core
 0x00000000420f5160,v1,cavium/thunderx2,core
 0x00000000430f0af0,v1,cavium/thunderx2,core
+0x00000000460f0010,v1,fujitsu/a64fx,core
 0x00000000480fd010,v1,hisilicon/hip08,core
 0x00000000500f0000,v1,ampere/emag,core
diff --git a/tools/perf/pmu-events/arch/powerpc/mapfile.csv b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
index 229150e7ab7d..4abdfc3f9692 100644
--- a/tools/perf/pmu-events/arch/powerpc/mapfile.csv
+++ b/tools/perf/pmu-events/arch/powerpc/mapfile.csv
@@ -15,3 +15,4 @@
 # Power8 entries
 004[bcd][[:xdigit:]]{4},1,power8,core
 004e[[:xdigit:]]{4},1,power9,core
+0080[[:xdigit:]]{4},1,power10,core
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/cache.json b/tools/perf/pmu-events/arch/powerpc/power10/cache.json
new file mode 100644
index 000000000000..616f29098c71
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/cache.json
@@ -0,0 +1,47 @@
+[
+  {
+    "EventCode": "1003C",
+    "EventName": "PM_EXEC_STALL_DMISS_L2L3",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from either the local L2 or local L3."
+  },
+  {
+    "EventCode": "34056",
+    "EventName": "PM_EXEC_STALL_LOAD_FINISH",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was finishing a load after its data was reloaded from a data source beyond the local L1; cycles in which the LSU was processing an L1-hit; cycles in which the NTF instruction merged with another load in the LMQ."
+  },
+  {
+    "EventCode": "3006C",
+    "EventName": "PM_RUN_CYC_SMT2_MODE",
+    "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT2 mode."
+  },
+  {
+    "EventCode": "300F4",
+    "EventName": "PM_RUN_INST_CMPL_CONC",
+    "BriefDescription": "PowerPC instructions completed by this thread when all threads in the core had the run-latch set."
+  },
+  {
+    "EventCode": "4C016",
+    "EventName": "PM_EXEC_STALL_DMISS_L2L3_CONFLICT",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, with a dispatch conflict."
+  },
+  {
+    "EventCode": "4D014",
+    "EventName": "PM_EXEC_STALL_LOAD",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a load instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "4D016",
+    "EventName": "PM_EXEC_STALL_PTESYNC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a PTESYNC instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "401EA",
+    "EventName": "PM_THRESH_EXC_128",
+    "BriefDescription": "Threshold counter exceeded a value of 128."
+  },
+  {
+    "EventCode": "400F6",
+    "EventName": "PM_BR_MPRED_CMPL",
+    "BriefDescription": "A mispredicted branch completed. Includes direction and target."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json
new file mode 100644
index 000000000000..703cd431ae5b
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/floating_point.json
@@ -0,0 +1,7 @@
+[
+  {
+    "EventCode": "4016E",
+    "EventName": "PM_THRESH_NOT_MET",
+    "BriefDescription": "Threshold counter did not meet threshold."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/frontend.json b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json
new file mode 100644
index 000000000000..eac8609dcc90
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/frontend.json
@@ -0,0 +1,217 @@
+[
+  {
+    "EventCode": "10004",
+    "EventName": "PM_EXEC_STALL_TRANSLATION",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss or ERAT miss and waited for it to resolve."
+  },
+  {
+    "EventCode": "10010",
+    "EventName": "PM_PMC4_OVERFLOW",
+    "BriefDescription": "The event selected for PMC4 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "10020",
+    "EventName": "PM_PMC4_REWIND",
+    "BriefDescription": "The speculative event selected for PMC4 rewinds and the counter for PMC4 is not charged."
+  },
+  {
+    "EventCode": "10038",
+    "EventName": "PM_DISP_STALL_TRANSLATION",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread because the MMU was handling a translation miss."
+  },
+  {
+    "EventCode": "1003A",
+    "EventName": "PM_DISP_STALL_BR_MPRED_IC_L2",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2 after suffering a branch mispredict."
+  },
+  {
+    "EventCode": "1E050",
+    "EventName": "PM_DISP_STALL_HELD_STF_MAPPER_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the STF mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR."
+  },
+  {
+    "EventCode": "1F054",
+    "EventName": "PM_DTLB_HIT",
+    "BriefDescription": "The PTE required by the instruction was resident in the TLB (data TLB access). When MMCR1[16]=0 this event counts only demand hits. When MMCR1[16]=1 this event includes demand and prefetch. Applies to both HPT and RPT."
+  },
+  {
+    "EventCode": "101E8",
+    "EventName": "PM_THRESH_EXC_256",
+    "BriefDescription": "Threshold counter exceeded a count of 256."
+  },
+  {
+    "EventCode": "101EC",
+    "EventName": "PM_THRESH_MET",
+    "BriefDescription": "Threshold exceeded."
+  },
+  {
+    "EventCode": "100F2",
+    "EventName": "PM_1PLUS_PPC_CMPL",
+    "BriefDescription": "Cycles in which at least one instruction is completed by this thread."
+  },
+  {
+    "EventCode": "100F6",
+    "EventName": "PM_IERAT_MISS",
+    "BriefDescription": "IERAT Reloaded to satisfy an IERAT miss. All page sizes are counted by this event."
+  },
+  {
+    "EventCode": "100F8",
+    "EventName": "PM_DISP_STALL_CYC",
+    "BriefDescription": "Cycles the ICT has no itags assigned to this thread (no instructions were dispatched during these cycles)."
+  },
+  {
+    "EventCode": "20114",
+    "EventName": "PM_MRK_L2_RC_DISP",
+    "BriefDescription": "Marked instruction RC dispatched in L2."
+  },
+  {
+    "EventCode": "2C010",
+    "EventName": "PM_EXEC_STALL_LSU",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Load Store Unit. This does not include simple fixed point instructions."
+  },
+  {
+    "EventCode": "2C016",
+    "EventName": "PM_DISP_STALL_IERAT_ONLY_MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction ERAT miss."
+  },
+  {
+    "EventCode": "2C01E",
+    "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3 after suffering a branch mispredict."
+  },
+  {
+    "EventCode": "2D01A",
+    "EventName": "PM_DISP_STALL_IC_MISS",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread due to an Icache Miss."
+  },
+  {
+    "EventCode": "2D01C",
+    "EventName": "PM_CMPL_STALL_STCX",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a stcx waiting for resolution from the nest before completing."
+  },
+  {
+    "EventCode": "2E018",
+    "EventName": "PM_DISP_STALL_FETCH",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread because Fetch was being held."
+  },
+  {
+    "EventCode": "2E01A",
+    "EventName": "PM_DISP_STALL_HELD_XVFC_MAPPER_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the XVFC mapper/SRB was full."
+  },
+  {
+    "EventCode": "2C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC2",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "24050",
+    "EventName": "PM_IOPS_DISP",
+    "BriefDescription": "Internal Operations dispatched. PM_IOPS_DISP / PM_INST_DISP will show the average number of internal operations per PowerPC instruction."
+  },
+  {
+    "EventCode": "2405E",
+    "EventName": "PM_ISSUE_CANCEL",
+    "BriefDescription": "An instruction issued and the issue was later cancelled. Only one cancel per PowerPC instruction."
+  },
+  {
+    "EventCode": "200FA",
+    "EventName": "PM_BR_TAKEN_CMPL",
+    "BriefDescription": "Branch Taken instruction completed."
+  },
+  {
+    "EventCode": "30012",
+    "EventName": "PM_FLUSH_COMPLETION",
+    "BriefDescription": "The instruction that was next to complete (oldest in the pipeline) did not complete because it suffered a flush."
+  },
+  {
+    "EventCode": "30014",
+    "EventName": "PM_EXEC_STALL_STORE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "30018",
+    "EventName": "PM_DISP_STALL_HELD_SCOREBOARD_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch while waiting on the Scoreboard. This event combines VSCR and FPSCR together."
+  },
+  {
+    "EventCode": "30026",
+    "EventName": "PM_EXEC_STALL_STORE_MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a store whose cache line was not resident in the L1 and was waiting for allocation of the missing line into the L1."
+  },
+  {
+    "EventCode": "3012A",
+    "EventName": "PM_MRK_L2_RC_DONE",
+    "BriefDescription": "L2 RC machine completed the transaction for the marked instruction."
+  },
+  {
+    "EventCode": "3F046",
+    "EventName": "PM_ITLB_HIT_1G",
+    "BriefDescription": "Instruction TLB hit (IERAT reload) page size 1G, which implies Radix Page Table translation is in use. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "34058",
+    "EventName": "PM_DISP_STALL_BR_MPRED_ICMISS",
+    "BriefDescription": "Cycles when dispatch was stalled after a mispredicted branch resulted in an instruction cache miss."
+  },
+  {
+    "EventCode": "3D05C",
+    "EventName": "PM_DISP_STALL_HELD_RENAME_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because the mapper/SRB was full. Includes GPR (count, link, tar), VSR, VMR, FPR and XVFC."
+  },
+  {
+    "EventCode": "3E052",
+    "EventName": "PM_DISP_STALL_IC_L3",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L3."
+  },
+  {
+    "EventCode": "3E054",
+    "EventName": "PM_LD_MISS_L1",
+    "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
+  },
+  {
+    "EventCode": "301EA",
+    "EventName": "PM_THRESH_EXC_1024",
+    "BriefDescription": "Threshold counter exceeded a value of 1024."
+  },
+  {
+    "EventCode": "300FA",
+    "EventName": "PM_INST_FROM_L3MISS",
+    "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss."
+  },
+  {
+    "EventCode": "40006",
+    "EventName": "PM_ISSUE_KILL",
+    "BriefDescription": "Cycles in which an instruction or group of instructions were cancelled after being issued. This event increments once per occurrence, regardless of how many instructions are included in the issue group."
+  },
+  {
+    "EventCode": "40116",
+    "EventName": "PM_MRK_LARX_FIN",
+    "BriefDescription": "Marked load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "4C010",
+    "EventName": "PM_DISP_STALL_BR_MPRED_IC_L3MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from sources beyond the local L3 after suffering a mispredicted branch."
+  },
+  {
+    "EventCode": "4D01E",
+    "EventName": "PM_DISP_STALL_BR_MPRED",
+    "BriefDescription": "Cycles when dispatch was stalled for this thread due to a mispredicted branch."
+  },
+  {
+    "EventCode": "4E010",
+    "EventName": "PM_DISP_STALL_IC_L3MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from any source beyond the local L3."
+  },
+  {
+    "EventCode": "4E01A",
+    "EventName": "PM_DISP_STALL_HELD_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any reason."
+  },
+  {
+    "EventCode": "44056",
+    "EventName": "PM_VECTOR_ST_CMPL",
+    "BriefDescription": "Vector store instructions completed."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/locks.json b/tools/perf/pmu-events/arch/powerpc/power10/locks.json
new file mode 100644
index 000000000000..016d8de0e14a
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/locks.json
@@ -0,0 +1,12 @@
+[
+  {
+    "EventCode": "1E058",
+    "EventName": "PM_STCX_FAIL_FIN",
+    "BriefDescription": "Conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "4E050",
+    "EventName": "PM_STCX_PASS_FIN",
+    "BriefDescription": "Conditional store instruction (STCX) passed. LARX and STCX are instructions used to acquire a lock."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/marked.json b/tools/perf/pmu-events/arch/powerpc/power10/marked.json
new file mode 100644
index 000000000000..93a5a5910648
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/marked.json
@@ -0,0 +1,147 @@
+[
+  {
+    "EventCode": "1002C",
+    "EventName": "PM_LD_PREFETCH_CACHE_LINE_MISS",
+    "BriefDescription": "The L1 cache was reloaded with a line that fulfills a prefetch request."
+  },
+  {
+    "EventCode": "10132",
+    "EventName": "PM_MRK_INST_ISSUED",
+    "BriefDescription": "Marked instruction issued. Note that stores always get issued twice, the address gets issued to the LSU and the data gets issued to the VSU. Also, issues can sometimes get killed/cancelled and cause multiple sequential issues for the same instruction."
+  },
+  {
+    "EventCode": "101E0",
+    "EventName": "PM_MRK_INST_DISP",
+    "BriefDescription": "The thread has dispatched a randomly sampled marked instruction."
+  },
+  {
+    "EventCode": "101E2",
+    "EventName": "PM_MRK_BR_TAKEN_CMPL",
+    "BriefDescription": "Marked Branch Taken instruction completed."
+  },
+  {
+    "EventCode": "20112",
+    "EventName": "PM_MRK_NTF_FIN",
+    "BriefDescription": "The marked instruction became the oldest in the pipeline before it finished. It excludes instructions that finish at dispatch."
+  },
+  {
+    "EventCode": "2C01C",
+    "EventName": "PM_EXEC_STALL_DMISS_OFF_CHIP",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a remote chip."
+  },
+  {
+    "EventCode": "20138",
+    "EventName": "PM_MRK_ST_NEST",
+    "BriefDescription": "A store has been sampled/marked and is at the point of execution where it has completed in the core and can no longer be flushed. At this point the store is sent to the L2."
+  },
+  {
+    "EventCode": "2013A",
+    "EventName": "PM_MRK_BRU_FIN",
+    "BriefDescription": "Marked Branch instruction finished."
+  },
+  {
+    "EventCode": "2C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC2",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[15:27]."
+  },
+  {
+    "EventCode": "24156",
+    "EventName": "PM_MRK_STCX_FIN",
+    "BriefDescription": "Marked conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "24158",
+    "EventName": "PM_MRK_INST",
+    "BriefDescription": "An instruction was marked. Includes both Random Instruction Sampling (RIS) at decode time and Random Event Sampling (RES) at the time the configured event happens."
+  },
+  {
+    "EventCode": "2415C",
+    "EventName": "PM_MRK_BR_CMPL",
+    "BriefDescription": "A marked branch completed. All branches are included."
+  },
+  {
+    "EventCode": "200FD",
+    "EventName": "PM_L1_ICACHE_MISS",
+    "BriefDescription": "Demand iCache Miss."
+  },
+  {
+    "EventCode": "30130",
+    "EventName": "PM_MRK_INST_FIN",
+    "BriefDescription": "marked instruction finished. Excludes instructions that finish at dispatch. Note that stores always finish twice since the address gets issued to the LSU and the data gets issued to the VSU."
+  },
+  {
+    "EventCode": "34146",
+    "EventName": "PM_MRK_LD_CMPL",
+    "BriefDescription": "Marked loads completed."
+  },
+  {
+    "EventCode": "3E158",
+    "EventName": "PM_MRK_STCX_FAIL",
+    "BriefDescription": "Marked conditional store instruction (STCX) failed. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "3E15A",
+    "EventName": "PM_MRK_ST_FIN",
+    "BriefDescription": "The marked instruction was a store of any kind."
+  },
+  {
+    "EventCode": "30068",
+    "EventName": "PM_L1_ICACHE_RELOADED_PREF",
+    "BriefDescription": "Counts all Icache prefetch reloads ( includes demand turned into prefetch)."
+  },
+  {
+    "EventCode": "301E4",
+    "EventName": "PM_MRK_BR_MPRED_CMPL",
+    "BriefDescription": "Marked Branch Mispredicted. Includes direction and target."
+  },
+  {
+    "EventCode": "300F6",
+    "EventName": "PM_LD_DEMAND_MISS_L1",
+    "BriefDescription": "The L1 cache was reloaded with a line that fulfills a demand miss request. Counted at reload time, before finish."
+  },
+  {
+    "EventCode": "300FE",
+    "EventName": "PM_DATA_FROM_L3MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss."
+  },
+  {
+    "EventCode": "40012",
+    "EventName": "PM_L1_ICACHE_RELOADED_ALL",
+    "BriefDescription": "Counts all Icache reloads includes demand, prefetch, prefetch turned into demand and demand turned into prefetch."
+  },
+  {
+    "EventCode": "40134",
+    "EventName": "PM_MRK_INST_TIMEO",
+    "BriefDescription": "Marked instruction finish timeout (instruction was lost)."
+  },
+  {
+    "EventCode": "4003C",
+    "EventName": "PM_DISP_STALL_HELD_SYNC_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of a synchronizing instruction that requires the ICT to be empty before dispatch."
+  },
+  {
+    "EventCode": "4505A",
+    "EventName": "PM_SP_FLOP_CMPL",
+    "BriefDescription": "Single Precision floating point instructions completed."
+  },
+  {
+    "EventCode": "4D058",
+    "EventName": "PM_VECTOR_FLOP_CMPL",
+    "BriefDescription": "Vector floating point instructions completed."
+  },
+  {
+    "EventCode": "4D05A",
+    "EventName": "PM_NON_MATH_FLOP_CMPL",
+    "BriefDescription": "Non Math instructions completed."
+  },
+  {
+    "EventCode": "401E0",
+    "EventName": "PM_MRK_INST_CMPL",
+    "BriefDescription": "marked instruction completed."
+  },
+  {
+    "EventCode": "400FE",
+    "EventName": "PM_DATA_FROM_MEMORY",
+    "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/memory.json b/tools/perf/pmu-events/arch/powerpc/power10/memory.json
new file mode 100644
index 000000000000..b01141eeebee
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/memory.json
@@ -0,0 +1,192 @@
+[
+  {
+    "EventCode": "1000A",
+    "EventName": "PM_PMC3_REWIND",
+    "BriefDescription": "The speculative event selected for PMC3 rewinds and the counter for PMC3 is not charged."
+  },
+  {
+    "EventCode": "1C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC1",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "1C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC1",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[0:12]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "1C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC1",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[0:12]."
+  },
+  {
+    "EventCode": "1C056",
+    "EventName": "PM_DERAT_MISS_4K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 4K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "1C058",
+    "EventName": "PM_DTLB_MISS_16G",
+    "BriefDescription": "Data TLB reload (after a miss) page size 16G. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "1C05C",
+    "EventName": "PM_DTLB_MISS_2M",
+    "BriefDescription": "Data TLB reload (after a miss) page size 2M. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "1E056",
+    "EventName": "PM_EXEC_STALL_STORE_PIPE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the store unit. This does not include cycles spent handling store misses, PTESYNC instructions or TLBIE instructions."
+  },
+  {
+    "EventCode": "1F150",
+    "EventName": "PM_MRK_ST_L2_CYC",
+    "BriefDescription": "Cycles from L2 RC dispatch to L2 RC completion."
+  },
+  {
+    "EventCode": "10062",
+    "EventName": "PM_LD_L3MISS_PEND_CYC",
+    "BriefDescription": "Cycles L3 miss was pending for this thread."
+  },
+  {
+    "EventCode": "20010",
+    "EventName": "PM_PMC1_OVERFLOW",
+    "BriefDescription": "The event selected for PMC1 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "2001A",
+    "EventName": "PM_ITLB_HIT",
+    "BriefDescription": "The PTE required to translate the instruction address was resident in the TLB (instruction TLB access/IERAT reload). Applies to both HPT and RPT. When MMCR1[17]=0 this event counts only for demand misses. When MMCR1[17]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "2003E",
+    "EventName": "PM_PTESYNC_FIN",
+    "BriefDescription": "Ptesync instruction finished in the store unit. Only one ptesync can finish at a time."
+  },
+  {
+    "EventCode": "2C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC2",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[15:27]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "2C054",
+    "EventName": "PM_DERAT_MISS_64K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "2C056",
+    "EventName": "PM_DTLB_MISS_4K",
+    "BriefDescription": "Data TLB reload (after a miss) page size 4K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "2D154",
+    "EventName": "PM_MRK_DERAT_MISS_64K",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 64K for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "200F6",
+    "EventName": "PM_DERAT_MISS",
+    "BriefDescription": "DERAT Reloaded to satisfy a DERAT miss. All page sizes are counted by this event. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "3000A",
+    "EventName": "PM_DISP_STALL_ITLB_MISS",
+    "BriefDescription": "Cycles when dispatch was stalled while waiting to resolve an instruction TLB miss."
+  },
+  {
+    "EventCode": "30016",
+    "EventName": "PM_EXEC_STALL_DERAT_DTLB_MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered a TLB miss and waited for it resolve."
+  },
+  {
+    "EventCode": "3C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC3",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "3C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC3",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[30:42]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "3C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC3",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[30:42]."
+  },
+  {
+    "EventCode": "3C054",
+    "EventName": "PM_DERAT_MISS_16M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 16M. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "3C056",
+    "EventName": "PM_DTLB_MISS_64K",
+    "BriefDescription": "Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "3C058",
+    "EventName": "PM_LARX_FIN",
+    "BriefDescription": "Load and reserve instruction (LARX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "301E2",
+    "EventName": "PM_MRK_ST_CMPL",
+    "BriefDescription": "Marked store completed and sent to nest. Note that this count excludes cache-inhibited stores."
+  },
+  {
+    "EventCode": "300FC",
+    "EventName": "PM_DTLB_MISS",
+    "BriefDescription": "The DPTEG required for the load/store instruction in execution was missing from the TLB. It includes pages of all sizes for demand and prefetch activity."
+  },
+  {
+    "EventCode": "4D02C",
+    "EventName": "PM_PMC1_REWIND",
+    "BriefDescription": "The speculative event selected for PMC1 rewinds and the counter for PMC1 is not charged."
+  },
+  {
+    "EventCode": "4003E",
+    "EventName": "PM_LD_CMPL",
+    "BriefDescription": "Loads completed."
+  },
+  {
+    "EventCode": "4C040",
+    "EventName": "PM_XFER_FROM_SRC_PMC4",
+    "BriefDescription": "The processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "4C142",
+    "EventName": "PM_MRK_XFER_FROM_SRC_PMC4",
+    "BriefDescription": "For a marked data transfer instruction, the processor's L1 data cache was reloaded from the source specified in MMCR3[45:57]. If MMCR1[16|17] is 0 (default), this count includes only lines that were reloaded to satisfy a demand miss. If MMCR1[16|17] is 1, this count includes both demand misses and prefetch reloads."
+  },
+  {
+    "EventCode": "4C144",
+    "EventName": "PM_MRK_XFER_FROM_SRC_CYC_PMC4",
+    "BriefDescription": "Cycles taken for a marked demand miss to reload a line from the source specified in MMCR3[45:57]."
+  },
+  {
+    "EventCode": "4C056",
+    "EventName": "PM_DTLB_MISS_16M",
+    "BriefDescription": "Data TLB reload (after a miss) page size 16M. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "4C05A",
+    "EventName": "PM_DTLB_MISS_1G",
+    "BriefDescription": "Data TLB reload (after a miss) page size 1G. Implies radix translation was used. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "4C15E",
+    "EventName": "PM_MRK_DTLB_MISS_64K",
+    "BriefDescription": "Marked Data TLB reload (after a miss) page size 64K. When MMCR1[16]=0 this event counts only for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "4D056",
+    "EventName": "PM_NON_FMA_FLOP_CMPL",
+    "BriefDescription": "Non FMA instruction completed."
+  },
+  {
+    "EventCode": "40164",
+    "EventName": "PM_MRK_DERAT_MISS_2M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M for a marked instruction. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/others.json b/tools/perf/pmu-events/arch/powerpc/power10/others.json
new file mode 100644
index 000000000000..a119e56cbf1c
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/others.json
@@ -0,0 +1,297 @@
+[
+  {
+    "EventCode": "10016",
+    "EventName": "PM_VSU0_ISSUE",
+    "BriefDescription": "VSU instructions issued to VSU pipe 0."
+  },
+  {
+    "EventCode": "1001C",
+    "EventName": "PM_ULTRAVISOR_INST_CMPL",
+    "BriefDescription": "PowerPC instructions that completed while the thread was in ultravisor state."
+  },
+  {
+    "EventCode": "100F0",
+    "EventName": "PM_CYC",
+    "BriefDescription": "Processor cycles."
+  },
+  {
+    "EventCode": "10134",
+    "EventName": "PM_MRK_ST_DONE_L2",
+    "BriefDescription": "Marked stores completed in L2 (RC machine done)."
+  },
+  {
+    "EventCode": "1505E",
+    "EventName": "PM_LD_HIT_L1",
+    "BriefDescription": "Loads that finished without experiencing an L1 miss."
+  },
+  {
+    "EventCode": "1D05E",
+    "EventName": "PM_DISP_STALL_HELD_HALT_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch because of power management."
+  },
+  {
+    "EventCode": "1E054",
+    "EventName": "PM_EXEC_STALL_DMISS_L21_L31",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from another core's L2 or L3 on the same chip."
+  },
+  {
+    "EventCode": "1E05A",
+    "EventName": "PM_CMPL_STALL_LWSYNC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a lwsync waiting to complete."
+  },
+  {
+    "EventCode": "1F056",
+    "EventName": "PM_DISP_SS0_2_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 0 dispatches either 1 or 2 instructions."
+  },
+  {
+    "EventCode": "1F15C",
+    "EventName": "PM_MRK_STCX_L2_CYC",
+    "BriefDescription": "Cycles spent in the nest portion of a marked Stcx instruction. It starts counting when the operation starts to drain to the L2 and it stops counting when the instruction retires from the Instruction Completion Table (ICT) in the Instruction Sequencing Unit (ISU)."
+  },
+  {
+    "EventCode": "10066",
+    "EventName": "PM_ADJUNCT_CYC",
+    "BriefDescription": "Cycles in which the thread is in Adjunct state. MSR[S HV PR] bits = 011."
+  },
+  {
+    "EventCode": "101E4",
+    "EventName": "PM_MRK_L1_ICACHE_MISS",
+    "BriefDescription": "Marked Instruction suffered an icache Miss."
+  },
+  {
+    "EventCode": "101EA",
+    "EventName": "PM_MRK_L1_RELOAD_VALID",
+    "BriefDescription": "Marked demand reload."
+  },
+  {
+    "EventCode": "100F4",
+    "EventName": "PM_FLOP_CMPL",
+    "BriefDescription": "Floating Point Operations Completed. Includes any type. It counts once for each 1, 2, 4 or 8 flop instruction. Use PM_1|2|4|8_FLOP_CMPL events to count flops."
+  },
+  {
+    "EventCode": "100FA",
+    "EventName": "PM_RUN_LATCH_ANY_THREAD_CYC",
+    "BriefDescription": "Cycles when at least one thread has the run latch set."
+  },
+  {
+    "EventCode": "100FC",
+    "EventName": "PM_LD_REF_L1",
+    "BriefDescription": "All L1 D cache load references counted at finish, gated by reject. In P9 and earlier this event counted only cacheable loads but in P10 both cacheable and non-cacheable loads are included."
+  },
+  {
+    "EventCode": "20006",
+    "EventName": "PM_DISP_STALL_HELD_ISSQ_FULL_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch due to Issue queue full. Includes issue queue and branch queue."
+  },
+  {
+    "EventCode": "2000C",
+    "EventName": "PM_RUN_LATCH_ALL_THREADS_CYC",
+    "BriefDescription": "Cycles when the run latch is set for all threads."
+  },
+  {
+    "EventCode": "2E010",
+    "EventName": "PM_ADJUNCT_INST_CMPL",
+    "BriefDescription": "PowerPC instructions that completed while the thread is in Adjunct state."
+  },
+  {
+    "EventCode": "2E014",
+    "EventName": "PM_STCX_FIN",
+    "BriefDescription": "Conditional store instruction (STCX) finished. LARX and STCX are instructions used to acquire a lock."
+  },
+  {
+    "EventCode": "20130",
+    "EventName": "PM_MRK_INST_DECODED",
+    "BriefDescription": "An instruction was marked at decode time. Random Instruction Sampling (RIS) only."
+  },
+  {
+    "EventCode": "20132",
+    "EventName": "PM_MRK_DFU_ISSUE",
+    "BriefDescription": "The marked instruction was a decimal floating point operation issued to the VSU. Measured at issue time."
+  },
+  {
+    "EventCode": "20134",
+    "EventName": "PM_MRK_FXU_ISSUE",
+    "BriefDescription": "The marked instruction was a fixed point operation issued to the VSU. Measured at issue time."
+  },
+  {
+    "EventCode": "2505C",
+    "EventName": "PM_VSU_ISSUE",
+    "BriefDescription": "At least one VSU instruction was issued to one of the VSU pipes. Up to 4 per cycle. Includes fixed point operations."
+  },
+  {
+    "EventCode": "2F054",
+    "EventName": "PM_DISP_SS1_2_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 1 dispatches either 1 or 2 instructions."
+  },
+  {
+    "EventCode": "2F056",
+    "EventName": "PM_DISP_SS1_4_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 1 dispatches either 3 or 4 instructions."
+  },
+  {
+    "EventCode": "2006C",
+    "EventName": "PM_RUN_CYC_SMT4_MODE",
+    "BriefDescription": "Cycles when this thread's run latch is set and the core is in SMT4 mode."
+  },
+  {
+    "EventCode": "201E0",
+    "EventName": "PM_MRK_DATA_FROM_MEMORY",
+    "BriefDescription": "The processor's data cache was reloaded from local, remote, or distant memory due to a demand miss for a marked load."
+  },
+  {
+    "EventCode": "201E4",
+    "EventName": "PM_MRK_DATA_FROM_L3MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked load."
+  },
+  {
+    "EventCode": "201E8",
+    "EventName": "PM_THRESH_EXC_512",
+    "BriefDescription": "Threshold counter exceeded a value of 512."
+  },
+  {
+    "EventCode": "200F2",
+    "EventName": "PM_INST_DISP",
+    "BriefDescription": "PowerPC instructions dispatched."
+  },
+  {
+    "EventCode": "30132",
+    "EventName": "PM_MRK_VSU_FIN",
+    "BriefDescription": "VSU marked instructions finished. Excludes simple FX instructions issued to the Store Unit."
+  },
+  {
+    "EventCode": "30038",
+    "EventName": "PM_EXEC_STALL_DMISS_LMEM",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local memory, local OpenCapp cache, or local OpenCapp memory."
+  },
+  {
+    "EventCode": "3F04A",
+    "EventName": "PM_LSU_ST5_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST2 port."
+  },
+  {
+    "EventCode": "34054",
+    "EventName": "PM_EXEC_STALL_DMISS_L2L3_NOCONFLICT",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from the local L2 or local L3, without a dispatch conflict."
+  },
+  {
+    "EventCode": "3405A",
+    "EventName": "PM_PRIVILEGED_INST_CMPL",
+    "BriefDescription": "PowerPC Instructions that completed while the thread is in Privileged state."
+  },
+  {
+    "EventCode": "3F150",
+    "EventName": "PM_MRK_ST_DRAIN_CYC",
+    "BriefDescription": "cycles to drain st from core to L2."
+  },
+  {
+    "EventCode": "3F054",
+    "EventName": "PM_DISP_SS0_4_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 0 dispatches either 3 or 4 instructions."
+  },
+  {
+    "EventCode": "3F056",
+    "EventName": "PM_DISP_SS0_8_INSTR_CYC",
+    "BriefDescription": "Cycles in which Superslice 0 dispatches either 5, 6, 7 or 8 instructions."
+  },
+  {
+    "EventCode": "30162",
+    "EventName": "PM_MRK_ISSUE_DEPENDENT_LOAD",
+    "BriefDescription": "The marked instruction was dependent on a load. It is eligible for issue kill."
+  },
+  {
+    "EventCode": "40114",
+    "EventName": "PM_MRK_START_PROBE_NOP_DISP",
+    "BriefDescription": "Marked Start probe nop dispatched. Instruction AND R0,R0,R0."
+  },
+  {
+    "EventCode": "4001C",
+    "EventName": "PM_VSU_FIN",
+    "BriefDescription": "VSU instructions finished."
+  },
+  {
+    "EventCode": "4C01A",
+    "EventName": "PM_EXEC_STALL_DMISS_OFF_NODE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a distant chip."
+  },
+  {
+    "EventCode": "4D012",
+    "EventName": "PM_PMC3_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC3 are met and PMC3 is charged."
+  },
+  {
+    "EventCode": "4D022",
+    "EventName": "PM_HYPERVISOR_INST_CMPL",
+    "BriefDescription": "PowerPC instructions that completed while the thread is in hypervisor state."
+  },
+  {
+    "EventCode": "4D026",
+    "EventName": "PM_ULTRAVISOR_CYC",
+    "BriefDescription": "Cycles when the thread is in Ultravisor state. MSR[S HV PR]=110."
+  },
+  {
+    "EventCode": "4D028",
+    "EventName": "PM_PRIVILEGED_CYC",
+    "BriefDescription": "Cycles when the thread is in Privileged state. MSR[S HV PR]=x00."
+  },
+  {
+    "EventCode": "40030",
+    "EventName": "PM_INST_FIN",
+    "BriefDescription": "Instructions finished."
+  },
+  {
+    "EventCode": "44146",
+    "EventName": "PM_MRK_STCX_CORE_CYC",
+    "BriefDescription": "Cycles spent in the core portion of a marked Stcx instruction. It starts counting when the instruction is decoded and stops counting when it drains into the L2."
+  },
+  {
+    "EventCode": "44054",
+    "EventName": "PM_VECTOR_LD_CMPL",
+    "BriefDescription": "Vector load instructions completed."
+  },
+  {
+    "EventCode": "45054",
+    "EventName": "PM_FMA_CMPL",
+    "BriefDescription": "Two floating point instructions completed (FMA class of instructions: fmadd, fnmadd, fmsub, fnmsub). Scalar instructions only."
+  },
+  {
+    "EventCode": "45056",
+    "EventName": "PM_SCALAR_FLOP_CMPL",
+    "BriefDescription": "Scalar floating point instructions completed."
+  },
+  {
+    "EventCode": "4505C",
+    "EventName": "PM_MATH_FLOP_CMPL",
+    "BriefDescription": "Math floating point instructions completed."
+  },
+  {
+    "EventCode": "4D05E",
+    "EventName": "PM_BR_CMPL",
+    "BriefDescription": "A branch completed. All branches are included."
+  },
+  {
+    "EventCode": "4E15E",
+    "EventName": "PM_MRK_INST_FLUSHED",
+    "BriefDescription": "The marked instruction was flushed."
+  },
+  {
+    "EventCode": "401E6",
+    "EventName": "PM_MRK_INST_FROM_L3MISS",
+    "BriefDescription": "The processor's instruction cache was reloaded from a source other than the local core's L1, L2, or L3 due to a demand miss for a marked instruction."
+  },
+  {
+    "EventCode": "401E8",
+    "EventName": "PM_MRK_DATA_FROM_L2MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss for a marked load."
+  },
+  {
+    "EventCode": "400F0",
+    "EventName": "PM_LD_DEMAND_MISS_L1_FIN",
+    "BriefDescription": "Load Missed L1, counted at finish time."
+  },
+  {
+    "EventCode": "400FA",
+    "EventName": "PM_RUN_INST_CMPL",
+    "BriefDescription": "Completed PowerPC instructions gated by the run latch."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json
new file mode 100644
index 000000000000..b61b5cc157ee
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/pipeline.json
@@ -0,0 +1,297 @@
+[
+  {
+    "EventCode": "100FE",
+    "EventName": "PM_INST_CMPL",
+    "BriefDescription": "PowerPC instructions completed."
+  },
+  {
+    "EventCode": "10006",
+    "EventName": "PM_DISP_STALL_HELD_OTHER_CYC",
+    "BriefDescription": "Cycles in which the NTC instruction is held at dispatch for any other reason."
+  },
+  {
+    "EventCode": "1000C",
+    "EventName": "PM_LSU_LD0_FIN",
+    "BriefDescription": "LSU Finished an internal operation in LD0 port."
+  },
+  {
+    "EventCode": "1000E",
+    "EventName": "PM_MMA_ISSUED",
+    "BriefDescription": "MMA instructions issued."
+  },
+  {
+    "EventCode": "10012",
+    "EventName": "PM_LSU_ST0_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST0 port."
+  },
+  {
+    "EventCode": "10014",
+    "EventName": "PM_LSU_ST4_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST4 port."
+  },
+  {
+    "EventCode": "10018",
+    "EventName": "PM_IC_DEMAND_CYC",
+    "BriefDescription": "Cycles in which an instruction reload is pending to satisfy a demand miss."
+  },
+  {
+    "EventCode": "10022",
+    "EventName": "PM_PMC2_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC2 are met and PMC2 is charged."
+  },
+  {
+    "EventCode": "10024",
+    "EventName": "PM_PMC5_OVERFLOW",
+    "BriefDescription": "The event selected for PMC5 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "10058",
+    "EventName": "PM_EXEC_STALL_FIN_AT_DISP",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline finished at dispatch and did not require execution in the LSU, BRU or VSU."
+  },
+  {
+    "EventCode": "1005A",
+    "EventName": "PM_FLUSH_MPRED",
+    "BriefDescription": "A flush occurred due to a mispredicted branch. Includes target and direction."
+  },
+  {
+    "EventCode": "1C05A",
+    "EventName": "PM_DERAT_MISS_2M",
+    "BriefDescription": "Data ERAT Miss (Data TLB Access) page size 2M. Implies radix translation. When MMCR1[16]=0 this event counts only DERAT reloads for demand misses. When MMCR1[16]=1 this event includes demand misses and prefetches."
+  },
+  {
+    "EventCode": "10064",
+    "EventName": "PM_DISP_STALL_IC_L2",
+    "BriefDescription": "Cycles when dispatch was stalled while the instruction was fetched from the local L2."
+  },
+  {
+    "EventCode": "10068",
+    "EventName": "PM_BR_FIN",
+    "BriefDescription": "A branch instruction finished. Includes predicted/mispredicted/unconditional."
+  },
+  {
+    "EventCode": "1006A",
+    "EventName": "PM_FX_LSU_FIN",
+    "BriefDescription": "Simple fixed point instruction issued to the store unit. Measured at finish time."
+  },
+  {
+    "EventCode": "1006C",
+    "EventName": "PM_RUN_CYC_ST_MODE",
+    "BriefDescription": "Cycles when the run latch is set and the core is in ST mode."
+  },
+  {
+    "EventCode": "20004",
+    "EventName": "PM_ISSUE_STALL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was dispatched but not issued yet."
+  },
+  {
+    "EventCode": "2000A",
+    "EventName": "PM_HYPERVISOR_CYC",
+    "BriefDescription": "Cycles when the thread is in Hypervisor state. MSR[S HV PR]=010."
+  },
+  {
+    "EventCode": "2000E",
+    "EventName": "PM_LSU_LD1_FIN",
+    "BriefDescription": "LSU Finished an internal operation in LD1 port."
+  },
+  {
+    "EventCode": "2C014",
+    "EventName": "PM_CMPL_STALL_SPECIAL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline required special handling before completing."
+  },
+  {
+    "EventCode": "2C018",
+    "EventName": "PM_EXEC_STALL_DMISS_L3MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for a load miss to resolve from a source beyond the local L2 or local L3."
+  },
+  {
+    "EventCode": "2D010",
+    "EventName": "PM_LSU_ST1_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST1 port."
+  },
+  {
+    "EventCode": "2D012",
+    "EventName": "PM_VSU1_ISSUE",
+    "BriefDescription": "VSU instructions issued to VSU pipe 1."
+  },
+  {
+    "EventCode": "2D018",
+    "EventName": "PM_EXEC_STALL_VSU",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the VSU (includes FXU, VSU, CRU)."
+  },
+  {
+    "EventCode": "2E01E",
+    "EventName": "PM_EXEC_STALL_NTC_FLUSH",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in any unit before it was flushed. Note that if the flush of the oldest instruction happens after finish, the cycles from dispatch to issue will be included in PM_DISP_STALL and the cycles from issue to finish will be included in PM_EXEC_STALL and its corresponding children."
+  },
+  {
+    "EventCode": "2013C",
+    "EventName": "PM_MRK_FX_LSU_FIN",
+    "BriefDescription": "The marked instruction was simple fixed point that was issued to the store unit. Measured at finish time."
+  },
+  {
+    "EventCode": "2405A",
+    "EventName": "PM_NTC_FIN",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline (NTC) finishes. Note that instructions can finish out of order, therefore not all the instructions that finish have a Next-to-complete status."
+  },
+  {
+    "EventCode": "201E2",
+    "EventName": "PM_MRK_LD_MISS_L1",
+    "BriefDescription": "Marked DL1 Demand Miss counted at finish time."
+  },
+  {
+    "EventCode": "200F4",
+    "EventName": "PM_RUN_CYC",
+    "BriefDescription": "Processor cycles gated by the run latch."
+  },
+  {
+    "EventCode": "30004",
+    "EventName": "PM_DISP_STALL_FLUSH",
+    "BriefDescription": "Cycles when dispatch was stalled because of a flush that happened to an instruction(s) that was not yet NTC. PM_EXEC_STALL_NTC_FLUSH only includes instructions that were flushed after becoming NTC."
+  },
+  {
+    "EventCode": "30008",
+    "EventName": "PM_EXEC_STALL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting to finish in one of the execution units (BRU, LSU, VSU). Only cycles between issue and finish are counted in this category."
+  },
+  {
+    "EventCode": "3001A",
+    "EventName": "PM_LSU_ST2_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST2 port."
+  },
+  {
+    "EventCode": "30020",
+    "EventName": "PM_PMC2_REWIND",
+    "BriefDescription": "The speculative event selected for PMC2 rewinds and the counter for PMC2 is not charged."
+  },
+  {
+    "EventCode": "30022",
+    "EventName": "PM_PMC4_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC4 are met and PMC4 is charged."
+  },
+  {
+    "EventCode": "30024",
+    "EventName": "PM_PMC6_OVERFLOW",
+    "BriefDescription": "The event selected for PMC6 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "30028",
+    "EventName": "PM_CMPL_STALL_MEM_ECC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was waiting for the non-speculative finish of either a stcx waiting for its result or a load waiting for non-critical sectors of data and ECC."
+  },
+  {
+    "EventCode": "30036",
+    "EventName": "PM_EXEC_STALL_SIMPLE_FX",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a simple fixed point instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "3003A",
+    "EventName": "PM_CMPL_STALL_EXCEPTION",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was not allowed to complete because it was interrupted by ANY exception, which has to be serviced before the instruction can complete."
+  },
+  {
+    "EventCode": "3F044",
+    "EventName": "PM_VSU2_ISSUE",
+    "BriefDescription": "VSU instructions issued to VSU pipe 2."
+  },
+  {
+    "EventCode": "30058",
+    "EventName": "PM_TLBIE_FIN",
+    "BriefDescription": "TLBIE instructions finished in the LSU. Two TLBIEs can finish each cycle. All will be counted."
+  },
+  {
+    "EventCode": "3D058",
+    "EventName": "PM_SCALAR_FSQRT_FDIV_ISSUE",
+    "BriefDescription": "Scalar versions of four floating point operations: fdiv,fsqrt (xvdivdp, xvdivsp, xvsqrtdp, xvsqrtsp)."
+  },
+  {
+    "EventCode": "30066",
+    "EventName": "PM_LSU_FIN",
+    "BriefDescription": "LSU Finished an internal operation (up to 4 per cycle)."
+  },
+  {
+    "EventCode": "40004",
+    "EventName": "PM_FXU_ISSUE",
+    "BriefDescription": "A fixed point instruction was issued to the VSU."
+  },
+  {
+    "EventCode": "40008",
+    "EventName": "PM_NTC_ALL_FIN",
+    "BriefDescription": "Cycles in which both instructions in the ICT entry pair show as finished. These are the cycles between finish and completion for the oldest pair of instructions in the pipeline."
+  },
+  {
+    "EventCode": "40010",
+    "EventName": "PM_PMC3_OVERFLOW",
+    "BriefDescription": "The event selected for PMC3 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "4C012",
+    "EventName": "PM_EXEC_STALL_DERAT_ONLY_MISS",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline suffered an ERAT miss and waited for it resolve."
+  },
+  {
+    "EventCode": "4C018",
+    "EventName": "PM_CMPL_STALL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline cannot complete because the thread was blocked for any reason."
+  },
+  {
+    "EventCode": "4C01E",
+    "EventName": "PM_LSU_ST3_FIN",
+    "BriefDescription": "LSU Finished an internal operation in ST3 port."
+  },
+  {
+    "EventCode": "4D018",
+    "EventName": "PM_EXEC_STALL_BRU",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was executing in the Branch unit."
+  },
+  {
+    "EventCode": "4D01A",
+    "EventName": "PM_CMPL_STALL_HWSYNC",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a hwsync waiting for response from L2 before completing."
+  },
+  {
+    "EventCode": "4D01C",
+    "EventName": "PM_EXEC_STALL_TLBIEL",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIEL instruction executing in the Load Store Unit. TLBIEL instructions have lower overhead than TLBIE instructions because they don't get set to the nest."
+  },
+  {
+    "EventCode": "4E012",
+    "EventName": "PM_EXEC_STALL_UNKNOWN",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline completed without an ntf_type pulse. The ntf_pulse was missed by the ISU because the NTF finishes and completions came too close together."
+  },
+  {
+    "EventCode": "4D020",
+    "EventName": "PM_VSU3_ISSUE",
+    "BriefDescription": "VSU instruction was issued to VSU pipe 3."
+  },
+  {
+    "EventCode": "40132",
+    "EventName": "PM_MRK_LSU_FIN",
+    "BriefDescription": "LSU marked instruction finish."
+  },
+  {
+    "EventCode": "45058",
+    "EventName": "PM_IC_MISS_CMPL",
+    "BriefDescription": "Non-speculative icache miss, counted at completion."
+  },
+  {
+    "EventCode": "4D050",
+    "EventName": "PM_VSU_NON_FLOP_CMPL",
+    "BriefDescription": "Non-floating point VSU instructions completed."
+  },
+  {
+    "EventCode": "4D052",
+    "EventName": "PM_2FLOP_CMPL",
+    "BriefDescription": "Double Precision vector version of fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg completed."
+  },
+  {
+    "EventCode": "400F2",
+    "EventName": "PM_1PLUS_PPC_DISP",
+    "BriefDescription": "Cycles at least one Instr Dispatched."
+  },
+  {
+    "EventCode": "400F8",
+    "EventName": "PM_FLUSH",
+    "BriefDescription": "Flush (any type)."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/pmc.json b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json
new file mode 100644
index 000000000000..ea122a91ceb0
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/pmc.json
@@ -0,0 +1,22 @@
+[
+  {
+    "EventCode": "301E8",
+    "EventName": "PM_THRESH_EXC_64",
+    "BriefDescription": "Threshold counter exceeded a value of 64."
+  },
+  {
+    "EventCode": "45050",
+    "EventName": "PM_1FLOP_CMPL",
+    "BriefDescription": "One floating point instruction completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)."
+  },
+  {
+    "EventCode": "45052",
+    "EventName": "PM_4FLOP_CMPL",
+    "BriefDescription": "Four floating point instructions completed (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg)."
+  },
+  {
+    "EventCode": "4D054",
+    "EventName": "PM_8FLOP_CMPL",
+    "BriefDescription": "Four Double Precision vector instructions completed."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/translation.json b/tools/perf/pmu-events/arch/powerpc/power10/translation.json
new file mode 100644
index 000000000000..5a714e3dd71a
--- /dev/null
+++ b/tools/perf/pmu-events/arch/powerpc/power10/translation.json
@@ -0,0 +1,57 @@
+[
+  {
+    "EventCode": "1F15E",
+    "EventName": "PM_MRK_START_PROBE_NOP_CMPL",
+    "BriefDescription": "Marked Start probe nop (AND R0,R0,R0) completed."
+  },
+  {
+    "EventCode": "20016",
+    "EventName": "PM_ST_FIN",
+    "BriefDescription": "Store finish count. Includes speculative activity."
+  },
+  {
+    "EventCode": "20018",
+    "EventName": "PM_ST_FWD",
+    "BriefDescription": "Store forwards that finished."
+  },
+  {
+    "EventCode": "2011C",
+    "EventName": "PM_MRK_NTF_CYC",
+    "BriefDescription": "Cycles during which the marked instruction is the oldest in the pipeline (NTF or NTC)."
+  },
+  {
+    "EventCode": "2E01C",
+    "EventName": "PM_EXEC_STALL_TLBIE",
+    "BriefDescription": "Cycles in which the oldest instruction in the pipeline was a TLBIE instruction executing in the Load Store Unit."
+  },
+  {
+    "EventCode": "201E6",
+    "EventName": "PM_THRESH_EXC_32",
+    "BriefDescription": "Threshold counter exceeded a value of 32."
+  },
+  {
+    "EventCode": "200F0",
+    "EventName": "PM_ST_CMPL",
+    "BriefDescription": "Stores completed from S2Q (2nd-level store queue). This event includes regular stores, stcx and cache inhibited stores. The following operations are excluded (pteupdate, snoop tlbie complete, store atomics, miso, load atomic payloads, tlbie, tlbsync, slbieg, isync, msgsnd, slbiag, cpabort, copy, tcheck, tend, stsync, dcbst, icbi, dcbf, hwsync, lwsync, ptesync, eieio, msgsync)."
+  },
+  {
+    "EventCode": "200FE",
+    "EventName": "PM_DATA_FROM_L2MISS",
+    "BriefDescription": "The processor's data cache was reloaded from a source other than the local core's L1 or L2 due to a demand miss."
+  },
+  {
+    "EventCode": "30010",
+    "EventName": "PM_PMC2_OVERFLOW",
+    "BriefDescription": "The event selected for PMC2 caused the event counter to overflow."
+  },
+  {
+    "EventCode": "4D010",
+    "EventName": "PM_PMC1_SAVED",
+    "BriefDescription": "The conditions for the speculative event selected for PMC1 are met and PMC1 is charged."
+  },
+  {
+    "EventCode": "4D05C",
+    "EventName": "PM_DPP_FLOP_CMPL",
+    "BriefDescription": "Double-Precision or Quad-Precision instructions completed."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power8/metrics.json b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json
index fc4aa6c2ddc9..4e25525b7da6 100644
--- a/tools/perf/pmu-events/arch/powerpc/power8/metrics.json
+++ b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json
@@ -885,37 +885,37 @@
         "MetricName": "flush_rate_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_11_14_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_11to14_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_15_17_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_15to17_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_18_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_18plus_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_1_2_ENTRIES /  ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_1to2_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_3_6_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_3to6_slots_percent"
     },
     {
-        "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had atleast 1 slot valid",
+        "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had at least 1 slot valid",
         "MetricExpr": "PM_GCT_UTIL_7_10_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100",
         "MetricGroup": "general",
         "MetricName": "gct_util_7to10_slots_percent"
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json
index f8784c608479..db86ba36224d 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/metrics.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/metrics.json
@@ -1210,156 +1210,24 @@
         "MetricName": "inst_from_rmem_percent"
     },
     {
-        "BriefDescription": "%L2 Modified CO Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_CASTOUT_MOD/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_co_m_rd_util"
-    },
-    {
-        "BriefDescription": "L2 dcache invalidates per run inst (per core)",
-        "MetricExpr": "(PM_L2_DC_INV / 2) / PM_RUN_INST_CMPL * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_dc_inv_rate_percent"
-    },
-    {
         "BriefDescription": "Demand load misses as a % of L2 LD dispatches (per thread)",
         "MetricExpr": "PM_L1_DCACHE_RELOAD_VALID / (PM_L2_LD / 2) * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_dem_ld_disp_percent"
     },
     {
-        "BriefDescription": "L2 Icache invalidates per run inst (per core)",
-        "MetricExpr": "(PM_L2_IC_INV / 2) / PM_RUN_INST_CMPL * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ic_inv_rate_percent"
-    },
-    {
-        "BriefDescription": "L2 Inst misses as a % of total L2 Inst dispatches (per thread)",
-        "MetricExpr": "PM_L2_INST_MISS / PM_L2_INST * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_inst_miss_ratio_percent"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L2 Load hits",
-        "MetricExpr": "(PM_L2_LD_HIT / PM_RUN_CYC) / 2",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_hit_frequency"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L2 Load misses",
-        "MetricExpr": "(PM_L2_LD_MISS / PM_RUN_CYC) / 2",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_miss_frequency"
-    },
-    {
-        "BriefDescription": "L2 Load misses as a % of total L2 Load dispatches (per thread)",
-        "MetricExpr": "PM_L2_LD_MISS / PM_L2_LD * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_miss_ratio_percent"
-    },
-    {
-        "BriefDescription": "% L2 load disp attempts Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ld_rd_util"
-    },
-    {
-        "BriefDescription": "L2 load misses that require a cache write (4 pclks per disp attempt) % of pclks",
-        "MetricExpr": "((( PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4)/ PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_ldmiss_wr_util"
-    },
-    {
-        "BriefDescription": "L2 local pump prediction success",
-        "MetricExpr": "PM_L2_LOC_GUESS_CORRECT / (PM_L2_LOC_GUESS_CORRECT + PM_L2_LOC_GUESS_WRONG) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_local_pred_correct_percent"
-    },
-    {
-        "BriefDescription": "L2 COs that were in M,Me,Mu state as a % of all L2 COs",
-        "MetricExpr": "PM_L2_CASTOUT_MOD / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_mod_co_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Load RC dispatch atampts that failed because of address collisions and cclass conflicts",
-        "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR )/ PM_L2_RCLD_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_ld_disp_addr_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Load RC dispatch attempts that failed",
-        "MetricExpr": "(PM_L2_RCLD_DISP_FAIL_ADDR + PM_L2_RCLD_DISP_FAIL_OTHER)/ PM_L2_RCLD_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_ld_disp_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Store RC dispatch atampts that failed because of address collisions and cclass conflicts",
-        "MetricExpr": "PM_L2_RCST_DISP_FAIL_ADDR / PM_L2_RCST_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_st_disp_addr_fail_percent"
-    },
-    {
-        "BriefDescription": "% of L2 Store RC dispatch attempts that failed",
-        "MetricExpr": "(PM_L2_RCST_DISP_FAIL_ADDR + PM_L2_RCST_DISP_FAIL_OTHER)/ PM_L2_RCST_DISP * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rc_st_disp_fail_percent"
-    },
-    {
-        "BriefDescription": "L2 Cache Read Utilization (per core)",
-        "MetricExpr": "(((PM_L2_RCLD_DISP/2)*4)/ PM_RUN_CYC * 100) + (((PM_L2_RCST_DISP/2)*4)/PM_RUN_CYC * 100) + (((PM_L2_CASTOUT_MOD/2)*4)/PM_RUN_CYC * 100)",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_rd_util_percent"
-    },
-    {
-        "BriefDescription": "L2 COs that were in T,Te,Si,S state as a % of all L2 COs",
-        "MetricExpr": "PM_L2_CASTOUT_SHR / (PM_L2_CASTOUT_MOD + PM_L2_CASTOUT_SHR) * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_shr_co_percent"
-    },
-    {
         "BriefDescription": "L2 Store misses as a % of total L2 Store dispatches (per thread)",
         "MetricExpr": "PM_L2_ST_MISS / PM_L2_ST * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_st_miss_ratio_percent"
     },
     {
-        "BriefDescription": "% L2 store disp attempts Cache read Utilization (4 pclks per disp attempt)",
-        "MetricExpr": "((PM_L2_RCST_DISP/2)*4) / PM_RUN_CYC * 100",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_st_rd_util"
-    },
-    {
         "BriefDescription": "L2 stores that require a cache write (4 pclks per disp attempt) % of pclks",
         "MetricExpr": "((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100",
         "MetricGroup": "l2_stats",
         "MetricName": "l2_st_wr_util"
     },
     {
-        "BriefDescription": "L2 Cache Write Utilization (per core)",
-        "MetricExpr": "((((PM_L2_LD_DISP - PM_L2_LD_HIT)/2)*4) / PM_RUN_CYC * 100) + (((PM_L2_ST_DISP/2)*4) / PM_RUN_CYC * 100)",
-        "MetricGroup": "l2_stats",
-        "MetricName": "l2_wr_util_percent"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L3 Load hits",
-        "MetricExpr": "(PM_L3_LD_HIT / PM_RUN_CYC) / 2",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_ld_hit_frequency"
-    },
-    {
-        "BriefDescription": "Average number of cycles between L3 Load misses",
-        "MetricExpr": "(PM_L3_LD_MISS / PM_RUN_CYC) / 2",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_ld_miss_frequency"
-    },
-    {
-        "BriefDescription": "Average number of Write-in machines used. 1 of 8 WI machines is sampled every L3 cycle",
-        "MetricExpr": "(PM_L3_WI_USAGE / PM_RUN_CYC) * 8",
-        "MetricGroup": "l3_stats",
-        "MetricName": "l3_wi_usage"
-    },
-    {
         "BriefDescription": "Average icache miss latency",
         "MetricExpr": "PM_IC_DEMAND_CYC / PM_IC_DEMAND_REQ",
         "MetricGroup": "latency",
@@ -1823,7 +1691,7 @@
         "MetricName": "custom_secs"
     },
     {
-        "BriefDescription": "Percentage Cycles atleast one instruction dispatched",
+        "BriefDescription": "Percentage Cycles at least one instruction dispatched",
         "MetricExpr": "PM_1PLUS_PPC_DISP / PM_CYC * 100",
         "MetricName": "cycles_atleast_one_inst_dispatched_percent"
     },
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json
index 4ea7ec4f496e..0d46cb82bd52 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/cache.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/cache.json
@@ -38,31 +38,31 @@
     "EventName": "ic_fetch_stall.ic_stall_any",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_dq_empty",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_back_pressure",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_cache_inval.l2_invalidating_probe",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_cache_inval.fill_invalidated",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_tlb_rel",
@@ -97,25 +97,25 @@
     "EventName": "l2_request_g1.change_to_x",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g1.prefetch_l2_cmd",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g1.l2_hw_pf",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g1.group2",
     "EventCode": "0x60",
     "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_request_g1.all_no_prefetch",
@@ -150,31 +150,31 @@
     "EventName": "l2_request_g2.ic_rd_sized_nc",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g2.smc_inval",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g2.bus_locks_originator",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g2.bus_locks_responses",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_latency.l2_cycles_waiting_on_fills",
     "EventCode": "0x62",
     "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_wcb_req.wcb_write",
@@ -192,13 +192,13 @@
     "EventName": "l2_wcb_req.zero_byte_store",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_wcb_req.cl_zero",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
@@ -228,37 +228,37 @@
     "EventName": "l2_cache_req_stat.ls_rd_blk_c",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_x",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_s",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_miss",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ic_access_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
-    "UMask": "0x9"
+    "UMask": "0x09"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
@@ -270,12 +270,12 @@
     "EventName": "l2_fill_pending.l2_fill_busy",
     "EventCode": "0x6d",
     "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_pf_hit_l2",
     "EventCode": "0x70",
-    "BriefDescription": "L2 prefetch hit in L2.",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
     "UMask": "0xff"
   },
   {
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/core.json b/tools/perf/pmu-events/arch/x86/amdzen1/core.json
index 653b11b23399..4dceeabc4a9f 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/core.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/core.json
@@ -68,21 +68,21 @@
     "EventCode": "0xcb",
     "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
     "EventCode": "0xcb",
     "BriefDescription": "MMX instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.x87_instr",
     "EventCode": "0xcb",
     "BriefDescription": "x87 instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_cond",
@@ -103,19 +103,19 @@
     "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_fus_brnch_inst",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json
index a35542bd3b36..3995b528ebd6 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/floating-point.json
@@ -39,35 +39,35 @@
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to all fpu pipes.",
     "PublicDescription": "The number of operations (uOps) and dual-pipe uOps dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to all pipes.",
-    "UMask": "0xf"
+    "UMask": "0x0f"
   },
   {
     "EventName": "fpu_pipe_assignment.total3",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 3.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fpu_pipe_assignment.total2",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 2.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fpu_pipe_assignment.total1",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps on pipe 1.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fpu_pipe_assignment.total0",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps  on pipe 0.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_sched_empty",
@@ -79,28 +79,28 @@
     "EventCode": "0x02",
     "BriefDescription": "All Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "fp_retx87_fp_ops.div_sqr_r_ops",
     "EventCode": "0x02",
     "BriefDescription": "Divide and square root Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Divide and square root Ops.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retx87_fp_ops.mul_ops",
     "EventCode": "0x02",
     "BriefDescription": "Multiply Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Multiply Ops.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retx87_fp_ops.add_sub_ops",
     "EventCode": "0x02",
     "BriefDescription": "Add/subtract Ops.",
     "PublicDescription": "The number of x87 floating-point Ops that have retired. The number of events logged per cycle can vary from 0 to 8. Add/subtract Ops.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.all",
@@ -142,83 +142,83 @@
     "EventCode": "0x03",
     "BriefDescription": "Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single precision multiply-add FLOPS. Multiply-add counts as 2 FLOPS.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_div_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision divide/square root FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision divide/square root FLOPS.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_mult_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision multiply FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision multiply FLOPS.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.sp_add_sub_flops",
     "EventCode": "0x03",
     "BriefDescription": "Single-precision add/subtract FLOPS.",
     "PublicDescription": "This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15. Single-precision add/subtract FLOPS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.optimized",
     "EventCode": "0x04",
     "BriefDescription": "Number of Scalar Ops optimized.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Scalar Ops optimized.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.opt_potential",
     "EventCode": "0x04",
     "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass).",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of Ops that are candidates for optimization (have Z-bit either set or pass).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops eliminated.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops eliminated.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops.",
     "PublicDescription": "This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes. Number of SSE Move Ops.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.",
     "PublicDescription": "The number of serializing Ops retired. x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 bottom-executing uOps retired.",
     "PublicDescription": "The number of serializing Ops retired. x87 bottom-executing uOps retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
     "PublicDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE bottom-executing uOps retired.",
     "PublicDescription": "The number of serializing Ops retired. SSE bottom-executing uOps retired.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/memory.json b/tools/perf/pmu-events/arch/x86/amdzen1/memory.json
index b33a3c308019..385022fb026e 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/memory.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/memory.json
@@ -3,25 +3,25 @@
     "EventName": "ls_locks.bus_lock",
     "EventCode": "0x25",
     "BriefDescription": "Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_dispatch.ld_st_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of operations dispatched to the LS unit. Unit Masks ADDed. Load-op-Stores.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_dispatch.store_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of stores dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_dispatch.ld_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Counts the number of loads dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_stlf",
@@ -37,13 +37,13 @@
     "EventName": "ls_mab_alloc.dc_prefetcher",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB allocates by type - DC prefetcher.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.stores",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB allocates by type - stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_mab_alloc.loads",
@@ -85,61 +85,61 @@
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 1G size.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 2M size.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_32k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 32K size.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Reload of a page of 4K size.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_tablewalker.iside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on I-side.",
-    "UMask": "0xc"
+    "UMask": "0x0c"
   },
   {
     "EventName": "ls_tablewalker.ic_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 1.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_tablewalker.ic_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 0.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_tablewalker.dside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on D-side.",
-    "UMask": "0x3"
+    "UMask": "0x03"
   },
   {
     "EventName": "ls_tablewalker.dc_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_tablewalker.dc_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_misal_accesses",
@@ -150,31 +150,31 @@
     "EventName": "ls_pref_instr_disp.prefetch_nta",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions (PREFETCHNTA instruction) Dispatched.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_pref_instr_disp.store_prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions (3DNow PREFETCHW instruction) Dispatched.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_pref_instr_disp.load_prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched. Prefetch, Prefetch_T0_T1_T2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_inef_sw_pref.mab_mch_cnt",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_not_halted_cyc",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/other.json b/tools/perf/pmu-events/arch/x86/amdzen1/other.json
index ff780098d36e..7626986ce1fb 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/other.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/other.json
@@ -3,13 +3,13 @@
     "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.retire_token_stall",
@@ -33,24 +33,24 @@
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3_0 Tokens unavailable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 3 Tokens unavailable.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json
index 2cfe2d2f3bfd..bf5083c1c260 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json
@@ -10,7 +10,7 @@
     "EventName": "all_dc_accesses",
     "EventCode": "0x29",
     "BriefDescription": "All L1 Data Cache Accesses",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "MetricName": "all_l2_cache_accesses",
@@ -79,10 +79,10 @@
     "UMask": "0x70"
   },
   {
-    "MetricName": "l2_cache_hits_from_l2_hwpf",
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
     "BriefDescription": "L2 Cache Hits from L2 HWPF",
-    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
-    "MetricGroup": "l2_cache"
+    "UMask": "0xff"
   },
   {
     "EventName": "l3_accesses",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json
index ef4166a66288..84fb43fa59ad 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/branch.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/branch.json
@@ -24,25 +24,25 @@
     "EventName": "bp_l1_tlb_fetch_hit",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.",
-    "UMask": "0xFF"
+    "UMask": "0xff"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if1g",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 1GB page.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if2m",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 2MB page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "bp_l1_tlb_fetch_hit.if4k",
     "EventCode": "0x94",
     "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. Instruction fetches to a 4KB page.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_tlb_rel",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json
index f61b982f83ca..c858fb9477e3 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/cache.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/cache.json
@@ -27,25 +27,25 @@
     "EventName": "l2_request_g1.change_to_x",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g1.prefetch_l2_cmd",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g1.l2_hw_pf",
     "EventCode": "0x60",
     "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g1.group2",
     "EventCode": "0x60",
     "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_request_g1.all_no_prefetch",
@@ -80,31 +80,31 @@
     "EventName": "l2_request_g2.ic_rd_sized_nc",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_request_g2.smc_inval",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_request_g2.bus_locks_originator",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_request_g2.bus_locks_responses",
     "EventCode": "0x61",
     "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_latency.l2_cycles_waiting_on_fills",
     "EventCode": "0x62",
     "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_wcb_req.wcb_write",
@@ -122,13 +122,13 @@
     "EventName": "l2_wcb_req.zero_byte_store",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_wcb_req.cl_zero",
     "EventCode": "0x63",
     "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
@@ -158,37 +158,37 @@
     "EventName": "l2_cache_req_stat.ls_rd_blk_c",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_x",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_hit_s",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit clean line in L2.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "l2_cache_req_stat.ic_fill_miss",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_cache_req_stat.ic_access_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
     "EventCode": "0x64",
     "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
-    "UMask": "0x9"
+    "UMask": "0x09"
   },
   {
     "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
@@ -200,12 +200,12 @@
     "EventName": "l2_fill_pending.l2_fill_busy",
     "EventCode": "0x6d",
     "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l2_pf_hit_l2",
     "EventCode": "0x70",
-    "BriefDescription": "L2 prefetch hit in L2.",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
     "UMask": "0xff"
   },
   {
@@ -255,19 +255,19 @@
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 1GB page.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 2MB page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
     "EventCode": "0x85",
     "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs. Instruction fetches to a 4KB page.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "bp_snp_re_sync",
@@ -278,43 +278,43 @@
     "EventName": "ic_fetch_stall.ic_stall_any",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_dq_empty",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_fetch_stall.ic_stall_back_pressure",
     "EventCode": "0x87",
     "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_cache_inval.l2_invalidating_probe",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_cache_inval.fill_invalidated",
     "EventCode": "0x8c",
     "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
     "EventCode": "0x28a",
     "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "l3_request_g1.caching_l3_cache_accesses",
@@ -353,7 +353,7 @@
   },
   {
     "EventName": "xi_ccx_sdp_req1.all_l3_miss_req_typs",
-    "EventCode": "0x9A",
+    "EventCode": "0x9a",
     "BriefDescription": "All L3 Miss Request Types. Ignores SliceMask and ThreadMask.",
     "UMask": "0x3f",
     "Unit": "L3PMC"
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/core.json b/tools/perf/pmu-events/arch/x86/amdzen2/core.json
index 4b75183da94a..bed14829f0bc 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/core.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/core.json
@@ -68,21 +68,21 @@
     "EventCode": "0xcb",
     "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
     "EventCode": "0xcb",
     "BriefDescription": "MMX instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_ret_mmx_fp_instr.x87_instr",
     "EventCode": "0xcb",
     "BriefDescription": "x87 instructions.",
     "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_cond",
@@ -108,19 +108,19 @@
     "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
     "EventCode": "0x1cf",
     "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ex_ret_fus_brnch_inst",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json
index 622a0c420e46..91ed96f2580b 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/floating-point.json
@@ -4,35 +4,35 @@
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps.",
     "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.",
-    "UMask": "0xf"
+    "UMask": "0x0f"
   },
   {
     "EventName": "fpu_pipe_assignment.total3",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 3.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fpu_pipe_assignment.total2",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 2.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fpu_pipe_assignment.total1",
     "EventCode": "0x00",
     "BriefDescription": "Total number uOps assigned to pipe 1.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fpu_pipe_assignment.total0",
     "EventCode": "0x00",
     "BriefDescription": "Total number of fp uOps  on pipe 0.",
     "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.all",
@@ -45,96 +45,96 @@
     "EventCode": "0x03",
     "BriefDescription": "Multiply-add FLOPS. Multiply-add counts as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
     "PublicDescription": "",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.div_flops",
     "EventCode": "0x03",
     "BriefDescription": "Divide/square root FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.mult_flops",
     "EventCode": "0x03",
     "BriefDescription": "Multiply FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
     "EventCode": "0x03",
     "BriefDescription": "Add/subtract FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.optimized",
     "EventCode": "0x04",
     "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.opt_potential",
     "EventCode": "0x04",
     "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
     "EventCode": "0x04",
     "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "SSE bottom-executing uOps retired. The number of serializing Ops retired.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "The number of serializing Ops retired. SSE control word mispredict traps due to mispredictions in RC, FTZ or DAZ, or changes in mask bits.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_bot_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 bottom-executing uOps retired. The number of serializing Ops retired.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
     "EventCode": "0x05",
     "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "fp_disp_faults.ymm_spill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "fp_disp_faults.ymm_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "fp_disp_faults.xmm_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "fp_disp_faults.x87_fill_fault",
     "EventCode": "0x0e",
     "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json
index 715046b339cb..89822b9ddb79 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/memory.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/memory.json
@@ -4,31 +4,31 @@
     "EventCode": "0x24",
     "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.",
     "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_locks.spec_lock_hi_spec",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_locks.spec_lock_lo_spec",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_locks.non_spec_lock",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_locks.bus_lock",
     "EventCode": "0x25",
     "BriefDescription": "Retired lock instructions. Bus lock when a locked operations crosses a cache boundary or is done on an uncacheable memory type. Comparable to legacy bus lock.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_ret_cl_flush",
@@ -44,33 +44,33 @@
     "EventName": "ls_dispatch.ld_st_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Dispatch of a single op that performs a load from and store to the same memory address. Number of single ops that do load/store to an address.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_dispatch.store_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Number of stores dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_dispatch.ld_dispatch",
     "EventCode": "0x29",
     "BriefDescription": "Number of loads dispatched. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_smi_rx",
-    "EventCode": "0x2B",
+    "EventCode": "0x2b",
     "BriefDescription": "Number of SMIs received."
   },
   {
     "EventName": "ls_int_taken",
-    "EventCode": "0x2C",
+    "EventCode": "0x2c",
     "BriefDescription": "Number of interrupts taken."
   },
   {
     "EventName": "ls_rdtsc",
-    "EventCode": "0x2D",
+    "EventCode": "0x2d",
     "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative."
   },
   {
@@ -93,19 +93,19 @@
     "EventName": "ls_mab_alloc.dc_prefetcher",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_mab_alloc.stores",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. Stores.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_mab_alloc.loads",
     "EventCode": "0x41",
     "BriefDescription": "LS MAB Allocates by Type. Loads.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_rmt_dram",
@@ -123,19 +123,19 @@
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_dram",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. DRAM or IO from this thread's die.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_cache",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. Hit in cache; local CCX (not Local L2), or Remote CCX and the address's Home Node is on this thread's die.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_refills_from_sys.ls_mabresp_lcl_l2",
     "EventCode": "0x43",
     "BriefDescription": "Demand Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.all",
@@ -171,61 +171,61 @@
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload hit a coalesced page.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
     "EventCode": "0x45",
     "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_tablewalker.iside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on I-side.",
-    "UMask": "0xc"
+    "UMask": "0x0c"
   },
   {
     "EventName": "ls_tablewalker.ic_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 1.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_tablewalker.ic_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks IC Type 0.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_tablewalker.dside",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks on D-side.",
-    "UMask": "0x3"
+    "UMask": "0x03"
   },
   {
     "EventName": "ls_tablewalker.dc_type1",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 1.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_tablewalker.dc_type0",
     "EventCode": "0x46",
     "BriefDescription": "Total Page Table Walks DC Type 0.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_misal_accesses",
@@ -242,31 +242,31 @@
     "EventName": "ls_pref_instr_disp.prefetch_nta",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "ls_pref_instr_disp.prefetch_w",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). See docAPM3 PREFETCHW.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_pref_instr_disp.prefetch",
     "EventCode": "0x4b",
     "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). Prefetch_T0_T1_T2. PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_inef_sw_pref.mab_mch_cnt",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
     "EventCode": "0x52",
     "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_rmt_dram",
@@ -284,49 +284,49 @@
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_dram",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. DRAM or IO from this thread's die.  From DRAM (home node local).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_cache",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From another cache (home node local).",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_sw_pf_dc_fill.ls_mabresp_lcl_l2",
     "EventCode": "0x59",
     "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_dram",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node remote).",
     "UMask": "0x40"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_rmt_cache",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node remote).",
     "UMask": "0x10"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_dram",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM (home node local).",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_cache",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From another cache (home node local).",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "ls_hw_pf_dc_fill.ls_mabresp_lcl_l2",
-    "EventCode": "0x5A",
+    "EventCode": "0x5a",
     "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. Local L2 hit.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "ls_not_halted_cyc",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/other.json b/tools/perf/pmu-events/arch/x86/amdzen2/other.json
index e94994d4a60e..1bdf106ca785 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/other.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/other.json
@@ -14,13 +14,13 @@
     "EventName": "de_dis_uops_from_decoder.opcache_dispatched",
     "EventCode": "0xaa",
     "BriefDescription": "Count of dispatched Ops from OpCache.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_uops_from_decoder.decoder_dispatched",
     "EventCode": "0xaa",
     "BriefDescription": "Count of dispatched Ops from Decoder.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.fp_misc_rsrc_stall",
@@ -50,25 +50,25 @@
     "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.store_queue_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Store queue resource stall. Applies to all ops with store semantics.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.load_queue_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Load queue resource stall. Applies to all ops with load semantics.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_token_stall",
     "EventCode": "0xae",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Physical Register File resource stall. Applies to all ops that have an integer destination register.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.sc_agu_dispatch_stall",
@@ -92,24 +92,24 @@
     "EventName": "de_dis_dispatch_token_stalls0.alu_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALU tokens total unavailable.",
-    "UMask": "0x8"
+    "UMask": "0x08"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq3_0_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ3_0_TokenStall.",
-    "UMask": "0x4"
+    "UMask": "0x04"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq2_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 2 Tokens unavailable.",
-    "UMask": "0x2"
+    "UMask": "0x02"
   },
   {
     "EventName": "de_dis_dispatch_token_stalls0.alsq1_token_stall",
     "EventCode": "0xaf",
     "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. ALSQ 1 Tokens unavailable.",
-    "UMask": "0x1"
+    "UMask": "0x01"
   }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json
index 2ef91e25e661..a71694a043ba 100644
--- a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json
+++ b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json
@@ -10,7 +10,7 @@
     "EventName": "all_dc_accesses",
     "EventCode": "0x29",
     "BriefDescription": "All L1 Data Cache Accesses",
-    "UMask": "0x7"
+    "UMask": "0x07"
   },
   {
     "MetricName": "all_l2_cache_accesses",
@@ -79,10 +79,10 @@
     "UMask": "0x70"
   },
   {
-    "MetricName": "l2_cache_hits_from_l2_hwpf",
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
     "BriefDescription": "L2 Cache Hits from L2 HWPF",
-    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
-    "MetricGroup": "l2_cache"
+    "UMask": "0xff"
   },
   {
     "EventName": "l3_accesses",
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/branch.json b/tools/perf/pmu-events/arch/x86/amdzen3/branch.json
new file mode 100644
index 000000000000..018a7fe94fb9
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/branch.json
@@ -0,0 +1,53 @@
+[
+  {
+    "EventName": "bp_l1_btb_correct",
+    "EventCode": "0x8a",
+    "BriefDescription": "L1 Branch Prediction Overrides Existing Prediction (speculative)."
+  },
+  {
+    "EventName": "bp_l2_btb_correct",
+    "EventCode": "0x8b",
+    "BriefDescription": "L2 Branch Prediction Overrides Existing Prediction (speculative)."
+  },
+  {
+    "EventName": "bp_dyn_ind_pred",
+    "EventCode": "0x8e",
+    "BriefDescription": "Dynamic Indirect Predictions.",
+    "PublicDescription": "The number of times a branch used the indirect predictor to make a prediction."
+  },
+  {
+    "EventName": "bp_de_redirect",
+    "EventCode": "0x91",
+    "BriefDescription": "Decode Redirects",
+    "PublicDescription": "The number of times the instruction decoder overrides the predicted target."
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if1g",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (1G page size).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if2m",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instruction TLB hit (2M page size).",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_fetch_hit.if4k",
+    "EventCode": "0x94",
+    "BriefDescription": "The number of instruction fetches that hit in the L1 ITLB. L1 Instrcution TLB hit (4K or 16K page size).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_tlb_rel",
+    "EventCode": "0x99",
+    "BriefDescription": "The number of ITLB reload requests."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/cache.json b/tools/perf/pmu-events/arch/x86/amdzen3/cache.json
new file mode 100644
index 000000000000..fa1d7499a2e3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/cache.json
@@ -0,0 +1,402 @@
+[
+  {
+    "EventName": "l2_request_g1.rd_blk_l",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache reads (including hardware and software prefetch).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g1.rd_blk_x",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache stores.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g1.ls_rd_blk_c_s",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache shared reads.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g1.cacheable_ic_read",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Instruction cache reads.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g1.change_to_x",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). Data cache state change requests. Request change to writable, check L2 for current state.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_request_g1.prefetch_l2_cmd",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). PrefetchL2Cmd.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g1.l2_hw_pf",
+    "EventCode": "0x60",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 1 - Common). L2 Prefetcher. All prefetches accepted by L2 pipeline, hit or miss. Types of PF and L2 hit/miss broken out in a separate perfmon event.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g1.group2",
+    "EventCode": "0x60",
+    "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g2 (PMCx061).",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_request_g1.all_no_prefetch",
+    "EventCode": "0x60",
+    "UMask": "0xf9"
+  },
+  {
+    "EventName": "l2_request_g2.group1",
+    "EventCode": "0x61",
+    "BriefDescription": "Miscellaneous events covered in more detail by l2_request_g1 (PMCx060).",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_request_g2.ls_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Data cache read sized non-cacheable.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_request_g2.ic_rd_sized",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_request_g2.ic_rd_sized_nc",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Instruction cache read sized non-cacheable.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_request_g2.smc_inval",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Self-modifying code invalidates.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_request_g2.bus_locks_originator",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus locks.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_request_g2.bus_locks_responses",
+    "EventCode": "0x61",
+    "BriefDescription": "All L2 Cache Requests (Breakdown 2 - Rare). Bus lock response.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_latency.l2_cycles_waiting_on_fills",
+    "EventCode": "0x62",
+    "BriefDescription": "Total cycles spent waiting for L2 fills to complete from L3 or memory, divided by four. Event counts are for both threads. To calculate average latency, the number of fills from both threads must be used.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_write",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB write requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) write requests.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_wcb_req.wcb_close",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB close requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) close requests.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_wcb_req.zero_byte_store",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB zero byte store requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) zero byte store requests.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_wcb_req.cl_zero",
+    "EventCode": "0x63",
+    "BriefDescription": "LS to L2 WCB cache line zeroing requests. LS (Load/Store unit) to L2 WCB (Write Combining Buffer) cache line zeroing requests.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_cs",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache shared read hit in L2",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit in L2. Modifiable.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_l_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache read hit non-modifiable line in L2.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache store or state change hit in L2.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ls_rd_blk_c",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Data cache request miss in L2 (all types). Use l2_cache_misses_from_dc_misses instead.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_x",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit modifiable line in L2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_hit_s",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache hit non-modifiable line in L2.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_fill_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2. Use l2_cache_misses_from_ic_miss instead.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_access_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache requests in L2.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_miss_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request miss in L2 and Data cache request miss in L2 (all types).",
+    "UMask": "0x09"
+  },
+  {
+    "EventName": "l2_cache_req_stat.ic_dc_hit_in_l2",
+    "EventCode": "0x64",
+    "BriefDescription": "Core to L2 cacheable request access status (not including L2 Prefetch). Instruction cache request hit in L2 and Data cache request hit in L2 (all types).",
+    "UMask": "0xf6"
+  },
+  {
+    "EventName": "l2_fill_pending.l2_fill_busy",
+    "EventCode": "0x6d",
+    "BriefDescription": "Cycles with fill pending from L2. Total cycles spent with one or more fill requests in flight from L2.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_pf_hit_l2",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 prefetch hit in L2. Use l2_cache_hits_from_l2_hwpf instead.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_hit_l3",
+    "EventCode": "0x71",
+    "BriefDescription": "L2 prefetcher hits in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 cache and hit the L3.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_pf_miss_l2_l3",
+    "EventCode": "0x72",
+    "BriefDescription": "L2 prefetcher misses in L3. Counts all L2 prefetches accepted by the L2 pipeline which miss the L2 and the L3 caches.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ic_fw32",
+    "EventCode": "0x80",
+    "BriefDescription": "The number of 32B fetch windows transferred from IC pipe to DE instruction decoder (includes non-cacheable and cacheable fill responses)."
+  },
+  {
+    "EventName": "ic_fw32_miss",
+    "EventCode": "0x81",
+    "BriefDescription": "The number of 32B fetch windows tried to read the L1 IC and missed in the full tag."
+  },
+  {
+    "EventName": "ic_cache_fill_l2",
+    "EventCode": "0x82",
+    "BriefDescription": "Instruction Cache Refills from L2. The number of 64 byte instruction cache line was fulfilled from the L2 cache."
+  },
+  {
+    "EventName": "ic_cache_fill_sys",
+    "EventCode": "0x83",
+    "BriefDescription": "Instruction Cache Refills from System. The number of 64 byte instruction cache line fulfilled from system memory or another cache."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_hit",
+    "EventCode": "0x84",
+    "BriefDescription": "L1 ITLB Miss, L2 ITLB Hit. The number of instruction fetches that miss in the L1 ITLB but hit in the L2 ITLB."
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of instruction fetches that miss in both the L1 and L2 TLBs.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.coalesced_4k",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for >4K Coalesced page.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if1g",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 1G page.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if2m",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk for 2M page.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "bp_l1_tlb_miss_l2_tlb_miss.if4k",
+    "EventCode": "0x85",
+    "BriefDescription": "The number of valid fills into the ITLB originating from the LS Page-Table Walker. Tablewalk requests are issued for L1-ITLB and L2-ITLB misses. Walk to 4K page.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "bp_snp_re_sync",
+    "EventCode": "0x86",
+    "BriefDescription": "The number of pipeline restarts caused by invalidating probes that hit on the instruction stream currently being executed. This would happen if the active instruction stream was being modified by another processor in an MP system - typically a highly unlikely event."
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_any",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle for any reason (nothing valid in pipe ICM1).",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_dq_empty",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to DQ empty.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_fetch_stall.ic_stall_back_pressure",
+    "EventCode": "0x87",
+    "BriefDescription": "Instruction Pipe Stall. IC pipe was stalled during this clock cycle (including IC to OC fetches) due to back-pressure.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_cache_inval.l2_invalidating_probe",
+    "EventCode": "0x8c",
+    "BriefDescription": "IC line invalidated due to L2 invalidating probe (external or LS). The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_cache_inval.fill_invalidated",
+    "EventCode": "0x8c",
+    "BriefDescription": "IC line invalidated due to overwriting fill response. The number of instruction cache lines invalidated. A non-SMC event is CMC (cross modifying code), either from the other thread of the core or another core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.all_instruction_cache_accesses",
+    "EventCode": "0x18e",
+    "BriefDescription": "All Instruction Cache Accesses. Counts various IC tag related hit and miss events.",
+    "UMask": "0x1f"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.instruction_cache_miss",
+    "EventCode": "0x18e",
+    "BriefDescription": "Instruction Cache Miss. Counts various IC tag related hit and miss events.",
+    "UMask": "0x18"
+  },
+  {
+    "EventName": "ic_tag_hit_miss.instruction_cache_hit",
+    "EventCode": "0x18e",
+    "BriefDescription": "Instruction Cache Hit. Counts various IC tag related hit and miss events.",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "ic_oc_mode_switch.oc_ic_mode_switch",
+    "EventCode": "0x28a",
+    "BriefDescription": "OC Mode Switch. OC to IC mode switch.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ic_oc_mode_switch.ic_oc_mode_switch",
+    "EventCode": "0x28a",
+    "BriefDescription": "OC Mode Switch. IC to OC mode switch.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "op_cache_hit_miss.all_op_cache_accesses",
+    "EventCode": "0x28f",
+    "BriefDescription": "All Op Cache accesses. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "op_cache_hit_miss.op_cache_miss",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op Cache Miss. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "op_cache_hit_miss.op_cache_hit",
+    "EventCode": "0x28f",
+    "BriefDescription": "Op Cache Hit. Counts Op Cache micro-tag hit/miss events",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "l3_request_g1.caching_l3_cache_accesses",
+    "EventCode": "0x01",
+    "BriefDescription": "Caching: L3 cache accesses",
+    "UMask": "0x80",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_lookup_state.all_l3_req_typs",
+    "EventCode": "0x04",
+    "BriefDescription": "All L3 Request Types. All L3 cache Requests",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.other_l3_miss_typs",
+    "EventCode": "0x06",
+    "BriefDescription": "Other L3 Miss Request Types",
+    "UMask": "0xfe",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_comb_clstr_state.request_miss",
+    "EventCode": "0x06",
+    "BriefDescription": "L3 cache misses",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_sys_fill_latency",
+    "EventCode": "0x90",
+    "BriefDescription": "L3 Cache Miss Latency. Total cycles for all transactions divided by 16. Ignores SliceMask and ThreadMask.",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "xi_ccx_sdp_req1",
+    "EventCode": "0x9a",
+    "BriefDescription": "L3 Misses by Request Type. Ignores SliceID, EnAllSlices, CoreID, EnAllCores and ThreadMask. Requires unit mask 0xFF to engage event for counting.",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/core.json b/tools/perf/pmu-events/arch/x86/amdzen3/core.json
new file mode 100644
index 000000000000..4e27a2be359e
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/core.json
@@ -0,0 +1,137 @@
+[
+  {
+    "EventName": "ex_ret_instr",
+    "EventCode": "0xc0",
+    "BriefDescription": "Retired Instructions."
+  },
+  {
+    "EventName": "ex_ret_ops",
+    "EventCode": "0xc1",
+    "BriefDescription": "Retired Ops. Use macro_ops_retired instead.",
+    "PublicDescription": "The number of macro-ops retired."
+  },
+  {
+    "EventName": "ex_ret_brn",
+    "EventCode": "0xc2",
+    "BriefDescription": "Retired Branch Instructions.",
+    "PublicDescription": "The number of branch instructions retired. This includes all types of architectural control flow changes, including exceptions and interrupts."
+  },
+  {
+    "EventName": "ex_ret_brn_misp",
+    "EventCode": "0xc3",
+    "BriefDescription": "Retired Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of retired branch instructions, that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn",
+    "EventCode": "0xc4",
+    "BriefDescription": "Retired Taken Branch Instructions.",
+    "PublicDescription": "The number of taken branches that were retired. This includes all types of architectural control flow changes, including exceptions and interrupts."
+  },
+  {
+    "EventName": "ex_ret_brn_tkn_misp",
+    "EventCode": "0xc5",
+    "BriefDescription": "Retired Taken Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of retired taken branch instructions that were mispredicted."
+  },
+  {
+    "EventName": "ex_ret_brn_far",
+    "EventCode": "0xc6",
+    "BriefDescription": "Retired Far Control Transfers.",
+    "PublicDescription": "The number of far control transfers retired including far call/jump/return, IRET, SYSCALL and SYSRET, plus exceptions and interrupts. Far control transfers are not subject to branch prediction."
+  },
+  {
+    "EventName": "ex_ret_brn_resync",
+    "EventCode": "0xc7",
+    "BriefDescription": "Retired Branch Resyncs.",
+    "PublicDescription": "The number of resync branches. These reflect pipeline restarts due to certain microcode assists and events such as writes to the active instruction stream, among other things. Each occurrence reflects a restart penalty similar to a branch mispredict. This is relatively rare."
+  },
+  {
+    "EventName": "ex_ret_near_ret",
+    "EventCode": "0xc8",
+    "BriefDescription": "Retired Near Returns.",
+    "PublicDescription": "The number of near return instructions (RET or RET Iw) retired."
+  },
+  {
+    "EventName": "ex_ret_near_ret_mispred",
+    "EventCode": "0xc9",
+    "BriefDescription": "Retired Near Returns Mispredicted.",
+    "PublicDescription": "The number of near returns retired that were not correctly predicted by the return address predictor. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction."
+  },
+  {
+    "EventName": "ex_ret_brn_ind_misp",
+    "EventCode": "0xca",
+    "BriefDescription": "Retired Indirect Branch Instructions Mispredicted.",
+    "PublicDescription": "The number of indirect branches retired that were not correctly predicted. Each such mispredict incurs the same penalty as a mispredicted conditional branch instruction. Note that only EX mispredicts are counted."
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.sse_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "SSE instructions (SSE, SSE2, SSE3, SSSE3, SSE4A, SSE41, SSE42, AVX).",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.mmx_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "MMX instructions.",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. MMX instructions.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_ret_mmx_fp_instr.x87_instr",
+    "EventCode": "0xcb",
+    "BriefDescription": "x87 instructions.",
+    "PublicDescription": "The number of MMX, SSE or x87 instructions retired. The UnitMask allows the selection of the individual classes of instructions as given in the table. Each increment represents one complete instruction. Since this event includes non-numeric instructions it is not suitable for measuring MFLOPS. x87 instructions.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_ret_ind_brch_instr",
+    "EventCode": "0xcc",
+    "BriefDescription": "Retired Indirect Branch Instructions. The number of indirect branches retired."
+  },
+  {
+    "EventName": "ex_ret_cond",
+    "EventCode": "0xd1",
+    "BriefDescription": "Retired Conditional Branch Instructions."
+  },
+  {
+    "EventName": "ex_div_busy",
+    "EventCode": "0xd3",
+    "BriefDescription": "Div Cycles Busy count."
+  },
+  {
+    "EventName": "ex_div_count",
+    "EventCode": "0xd4",
+    "BriefDescription": "Div Op Count."
+  },
+  {
+    "EventName": "ex_ret_msprd_brnch_instr_dir_msmtch",
+    "EventCode": "0x1c7",
+    "BriefDescription": "Retired Mispredicted Branch Instructions due to Direction Mismatch",
+    "PublicDescription": "The number of retired conditional branch instructions that were not correctly predicted because of a branch direction mismatch."
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_count_rollover",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of times an op could not be tagged by IBS because of a previous tagged op that has not retired.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops_ret",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS that retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ex_tagged_ibs_ops.ibs_tagged_ops",
+    "EventCode": "0x1cf",
+    "BriefDescription": "Tagged IBS Ops. Number of Ops tagged by IBS.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ex_ret_fused_instr",
+    "EventCode": "0x1d0",
+    "BriefDescription": "Counts retired Fused Instructions."
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json b/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json
new file mode 100644
index 000000000000..40271df40015
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/data-fabric.json
@@ -0,0 +1,98 @@
+[
+  {
+    "EventName": "remote_outbound_data_controller_0",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 0",
+    "EventCode": "0x7c7",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_1",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 1",
+    "EventCode": "0x807",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_2",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 2",
+    "EventCode": "0x847",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "remote_outbound_data_controller_3",
+    "PublicDescription": "Remote Link Controller Outbound Packet Types: Data (32B): Remote Link Controller 3",
+    "EventCode": "0x887",
+    "UMask": "0x02",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_0",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x07",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_1",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x47",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_2",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x87",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_3",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0xc7",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_4",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x107",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_5",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x147",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_6",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x187",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  },
+  {
+    "EventName": "dram_channel_data_controller_7",
+    "PublicDescription": "DRAM Channel Controller Request Types: Requests with Data (64B): DRAM Channel Controller 0",
+    "EventCode": "0x1c7",
+    "UMask": "0x38",
+    "PerPkg": "1",
+    "Unit": "DFPMC"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json b/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json
new file mode 100644
index 000000000000..98cfcb9c78ec
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/floating-point.json
@@ -0,0 +1,139 @@
+[
+  {
+    "EventName": "fpu_pipe_assignment.total",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number of fp uOps.",
+    "PublicDescription": "Total number of fp uOps. The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS.",
+    "UMask": "0x0f"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total3",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 3.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one-cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 3.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total2",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 2.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 2.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total1",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number uOps assigned to pipe 1.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 1.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fpu_pipe_assignment.total0",
+    "EventCode": "0x00",
+    "BriefDescription": "Total number of fp uOps  on pipe 0.",
+    "PublicDescription": "The number of operations (uOps) dispatched to each of the 4 FPU execution pipelines. This event reflects how busy the FPU pipelines are and may be used for workload characterization. This includes all operations performed by x87, MMX, and SSE instructions, including moves. Each increment represents a one- cycle dispatch event. This event is a speculative event. Since this event includes non-numeric operations it is not suitable for measuring MFLOPS. Total number uOps assigned to pipe 0.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.all",
+    "EventCode": "0x03",
+    "BriefDescription": "All FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPS. The number of events logged per cycle can vary from 0 to 64. This event can count above 15.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mac_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Multiply-Accumulate FLOPs. Each MAC operation is counted as 2 FLOPS. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.div_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Divide/square root FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.mult_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Multiply FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_ret_sse_avx_ops.add_sub_flops",
+    "EventCode": "0x03",
+    "BriefDescription": "Add/subtract FLOPs. This is a retire-based event. The number of retired SSE/AVX FLOPs. The number of events logged per cycle can vary from 0 to 64. This event requires the use of the MergeEvent since it can count above 15 events per cycle. See 2.1.17.3 [Large Increment per Cycle Events]. It does not provide a useful count without the use of the MergeEvent.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.optimized",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of Scalar Ops optimized. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.opt_potential",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of Ops that are candidates for optimization (have Z-bit either set or pass). This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops_elim",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of SSE Move Ops eliminated. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_num_mov_elim_scal_op.sse_mov_ops",
+    "EventCode": "0x04",
+    "BriefDescription": "Number of SSE Move Ops. This is a dispatch based speculative event, and is useful for measuring the effectiveness of the Move elimination and Scalar code optimization schemes.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.sse_bot_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "SSE/AVX bottom-executing ops retired. The number of serializing Ops retired.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.sse_ctrl_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "SSE/AVX control word mispredict traps. The number of serializing Ops retired.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.x87_bot_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "x87 bottom-executing ops retired. The number of serializing Ops retired.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_retired_ser_ops.x87_ctrl_ret",
+    "EventCode": "0x05",
+    "BriefDescription": "x87 control word mispredict traps due to mispredictions in RC or PC, or changes in mask bits. The number of serializing Ops retired.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_spill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. YMM spill fault.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "fp_disp_faults.ymm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. YMM fill fault.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "fp_disp_faults.xmm_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. XMM fill fault.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "fp_disp_faults.x87_fill_fault",
+    "EventCode": "0x0e",
+    "BriefDescription": "Floating Point Dispatch Faults. x87 fill fault.",
+    "UMask": "0x01"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/memory.json b/tools/perf/pmu-events/arch/x86/amdzen3/memory.json
new file mode 100644
index 000000000000..a2833955dcd2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/memory.json
@@ -0,0 +1,428 @@
+[
+  {
+    "EventName": "ls_bad_status2.stli_other",
+    "EventCode": "0x24",
+    "BriefDescription": "Non-forwardable conflict; used to reduce STLI's via software. All reasons. Store To Load Interlock (STLI) are loads that were unable to complete because of a possible match with an older store, and the older store could not do STLF for some reason.",
+    "PublicDescription" : "Store-to-load conflicts: A load was unable to complete due to a non-forwardable conflict with an older store. Most commonly, a load's address range partially but not completely overlaps with an uncompleted older store. Software can avoid this problem by using same-size and same-alignment loads and stores when accessing the same data. Vector/SIMD code is particularly susceptible to this problem; software should construct wide vector stores by manipulating vector elements in registers using shuffle/blend/swap instructions prior to storing to memory, instead of using narrow element-by-element stores.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.spec_lock_hi_spec",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. High speculative cacheable lock speculation succeeded.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_locks.spec_lock_lo_spec",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Low speculative cacheable lock speculation succeeded.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_locks.non_spec_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Non-speculative lock succeeded.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_locks.bus_lock",
+    "EventCode": "0x25",
+    "BriefDescription": "Retired lock instructions. Comparable to legacy bus lock.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_ret_cl_flush",
+    "EventCode": "0x26",
+    "BriefDescription": "The number of retired CLFLUSH instructions. This is a non-speculative event."
+  },
+  {
+    "EventName": "ls_ret_cpuid",
+    "EventCode": "0x27",
+    "BriefDescription": "The number of CPUID instructions retired."
+  },
+  {
+    "EventName": "ls_dispatch.ld_st_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Load-op-Store Dispatch. Dispatch of a single op that performs a load from and store to the same memory address. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dispatch.store_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Dispatch of a single op that performs a memory store. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dispatch.ld_dispatch",
+    "EventCode": "0x29",
+    "BriefDescription": "Dispatch of a single op that performs a memory load. Counts the number of operations dispatched to the LS unit. Unit Masks ADDed.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_smi_rx",
+    "EventCode": "0x2b",
+    "BriefDescription": "Counts the number of SMIs received."
+  },
+  {
+    "EventName": "ls_int_taken",
+    "EventCode": "0x2c",
+    "BriefDescription": "Counts the number of interrupts taken."
+  },
+  {
+    "EventName": "ls_rdtsc",
+    "EventCode": "0x2d",
+    "BriefDescription": "Number of reads of the TSC (RDTSC instructions). The count is speculative."
+  },
+  {
+    "EventName": "ls_stlf",
+    "EventCode": "0x35",
+    "BriefDescription": "Number of STLF hits."
+  },
+  {
+    "EventName": "ls_st_commit_cancel2.st_commit_cancel_wcb_full",
+    "EventCode": "0x37",
+    "BriefDescription": "A non-cacheable store and the non-cacheable commit buffer is full.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dc_accesses",
+    "EventCode": "0x40",
+    "BriefDescription": "Number of accesses to the dcache for load/store references.",
+    "PublicDescription": "The number of accesses to the data cache for load and store references. This may include certain microcode scratchpad accesses, although these are generally rare. Each increment represents an eight-byte access, although the instruction may only be accessing a portion of that. This event is a speculative event."
+  },
+  {
+    "EventName": "ls_mab_alloc.all_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "All Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x7f"
+  },
+  {
+    "EventName": "ls_mab_alloc.hardware_prefetcher_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "Hardware Prefetcher Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_mab_alloc.load_store_allocations",
+    "EventCode": "0x41",
+    "BriefDescription": "Load Store Allocations. Counts when a LS pipe allocates a MAB entry.",
+    "UMask": "0x3f"
+  },
+  {
+    "EventName": "ls_mab_alloc.dc_prefetcher",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. DC prefetcher.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_mab_alloc.stores",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. Stores.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_mab_alloc.loads",
+    "EventCode": "0x41",
+    "BriefDescription": "LS MAB Allocates by Type. Loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.mem_io_remote",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.ext_cache_remote",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.mem_io_local",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.ext_cache_local",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.int_cache",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_dmnd_fills_from_sys.lcl_l2",
+    "EventCode": "0x43",
+    "BriefDescription": "Demand Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.mem_io_remote",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.ext_cache_remote",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.mem_io_local",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.ext_cache_local",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.int_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_any_fills_from_sys.lcl_l2",
+    "EventCode": "0x44",
+    "BriefDescription": "Any Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.all",
+    "EventCode": "0x45",
+    "BriefDescription": "All L1 DTLB Misses or Reloads. Use l1_dtlb_misses instead.",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that also missed in the L2 TLB.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that also missed in the L2 TLB.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload coalesced page that also missed in the L2 TLB.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_miss",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that missed the L2 TLB.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_1g_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 1G page that hit in the L2 TLB.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_2m_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 2M page that hit in the L2 TLB.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_coalesced_page_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a coalesced page that hit in the L2 TLB.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_l1_d_tlb_miss.tlb_reload_4k_l2_hit",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Miss. DTLB reload to a 4K page that hit in the L2 TLB.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_tablewalker.iside",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks on I-side.",
+    "UMask": "0x0c"
+  },
+  {
+    "EventName": "ls_tablewalker.ic_type1",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks IC Type 1.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_tablewalker.ic_type0",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks IC Type 0.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_tablewalker.dside",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks on D-side.",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "ls_tablewalker.dc_type1",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks DC Type 1.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_tablewalker.dc_type0",
+    "EventCode": "0x46",
+    "BriefDescription": "Total Page Table Walks DC Type 0.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_misal_loads.ma4k",
+    "EventCode": "0x47",
+    "BriefDescription": "The number of 4KB misaligned (i.e., page crossing) loads.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_misal_loads.ma64",
+    "EventCode": "0x47",
+    "BriefDescription": "The number of 64B misaligned (i.e., cacheline crossing) loads.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_pref_instr_disp",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative).",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_nta",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchNTA instruction. See docAPM3 PREFETCHlevel.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch_w",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchW instruction. See docAPM3 PREFETCHW.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_pref_instr_disp.prefetch",
+    "EventCode": "0x4b",
+    "BriefDescription": "Software Prefetch Instructions Dispatched (Speculative). PrefetchT0, T1 and T2 instructions. See docAPM3 PREFETCHlevel.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.mab_mch_cnt",
+    "EventCode": "0x52",
+    "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a match on an already-allocated miss request buffer.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_inef_sw_pref.data_pipe_sw_pf_dc_hit",
+    "EventCode": "0x52",
+    "BriefDescription": "The number of software prefetches that did not fetch data outside of the processor core. Software PREFETCH instruction saw a DC hit.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.mem_io_remote",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.ext_cache_remote",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.mem_io_local",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.ext_cache_local",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.int_cache",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_sw_pf_dc_fills.lcl_l2",
+    "EventCode": "0x59",
+    "BriefDescription": "Software Prefetch Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.mem_io_remote",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in different Node.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.ext_cache_remote",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From CCX Cache in different Node.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.mem_io_local",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From DRAM or IO connected in same node.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.ext_cache_local",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From cache of different CCX in same node.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.int_cache",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From L3 or different L2 in same CCX.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "ls_hw_pf_dc_fills.lcl_l2",
+    "EventCode": "0x5a",
+    "BriefDescription": "Hardware Prefetch Data Cache Fills by Data Source. From Local L2 to the core.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "ls_alloc_mab_count",
+    "EventCode": "0x5f",
+    "BriefDescription": "Count of Allocated Mabs",
+    "PublicDescription": "This event counts the in-flight L1 data cache misses (allocated Miss Address Buffers) divided by 4 and rounded down each cycle unless used with the MergeEvent functionality. If the MergeEvent is used, it counts the exact number of outstanding L1 data cache misses. See 2.1.17.3 [Large Increment per Cycle Events]."
+  },
+  {
+    "EventName": "ls_not_halted_cyc",
+    "EventCode": "0x76",
+    "BriefDescription": "Cycles not in Halt."
+  },
+  {
+    "EventName": "ls_tlb_flush.all_tlb_flushes",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLB Flushes. Requires unit mask 0xFF to engage event for counting. Use all_tlbs_flushed instead",
+    "UMask": "0xff"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/other.json b/tools/perf/pmu-events/arch/x86/amdzen3/other.json
new file mode 100644
index 000000000000..7da5d0791ea3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/other.json
@@ -0,0 +1,103 @@
+[
+  {
+    "EventName": "de_dis_uop_queue_empty_di0",
+    "EventCode": "0xa9",
+    "BriefDescription": "Cycles where the Micro-Op Queue is empty."
+  },
+  {
+    "EventName": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch",
+    "EventCode": "0xab",
+    "BriefDescription": "Any Integer dispatch. Types of Oops Dispatched from Decoder.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch",
+    "EventCode": "0xab",
+    "BriefDescription": "Any FP dispatch. Types of Oops Dispatched from Decoder.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_flush_recovery_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP Flush recovery stall.",
+    "UMask": "0x80"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_sch_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. FP scheduler resource stall. Applies to ops that use the FP scheduler.",
+    "UMask": "0x40"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.fp_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Floating point register file resource stall. Applies to all FP ops that have a destination register.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.taken_brnch_buffer_rsrc",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Taken branch buffer resource stall.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.int_sched_misc_token_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Integer Scheduler miscellaneous resource stall.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.store_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Store Queue resource stall. Applies to all ops with store semantics.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.load_queue_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Load Queue resource stall. Applies to all ops with load semantics.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls1.int_phy_reg_file_rsrc_stall",
+    "EventCode": "0xae",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a Token Stall. Also counts cycles when the thread is not selected to dispatch but would have been stalled due to a Token Stall. Integer Physical Register File resource stall. Integer Physical Register File, applies to all ops that have an integer destination register.",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.retire_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. Insufficient Retire Queue tokens available.",
+    "UMask": "0x20"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.agsq_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. AGSQ Tokens unavailable.",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch3_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 3 available.",
+    "UMask": "0x08"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch2_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 2 available.",
+    "UMask": "0x04"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch1_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 1 available.",
+    "UMask": "0x02"
+  },
+  {
+    "EventName": "de_dis_dispatch_token_stalls2.int_sch0_token_stall",
+    "EventCode": "0xaf",
+    "BriefDescription": "Cycles where a dispatch group is valid but does not get dispatched due to a token stall. No tokens for Integer Scheduler Queue 0 available.",
+    "UMask": "0x01"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json
new file mode 100644
index 000000000000..988cf68ae825
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json
@@ -0,0 +1,214 @@
+[
+  {
+    "MetricName": "branch_misprediction_ratio",
+    "BriefDescription": "Execution-Time Branch Misprediction Ratio (Non-Speculative)",
+    "MetricExpr": "d_ratio(ex_ret_brn_misp, ex_ret_brn)",
+    "MetricGroup": "branch_prediction",
+    "ScaleUnit": "100%"
+  },
+  {
+    "EventName": "all_data_cache_accesses",
+    "EventCode": "0x29",
+    "BriefDescription": "All L1 Data Cache Accesses",
+    "UMask": "0x07"
+  },
+  {
+    "MetricName": "all_l2_cache_accesses",
+    "BriefDescription": "All L2 Cache Accesses",
+    "MetricExpr": "l2_request_g1.all_no_prefetch + l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_accesses_from_ic_misses",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 Cache Accesses from L1 Instruction Cache Misses (including prefetch)",
+    "UMask": "0x10"
+  },
+  {
+    "EventName": "l2_cache_accesses_from_dc_misses",
+    "EventCode": "0x60",
+    "BriefDescription": "L2 Cache Accesses from L1 Data Cache Misses (including prefetch)",
+    "UMask": "0xe8"
+  },
+  {
+    "MetricName": "l2_cache_accesses_from_l2_hwpf",
+    "BriefDescription": "L2 Cache Accesses from L2 HWPF",
+    "MetricExpr": "l2_pf_hit_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "all_l2_cache_misses",
+    "BriefDescription": "All L2 Cache Misses",
+    "MetricExpr": "l2_cache_req_stat.ic_dc_miss_in_l2 + l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_misses_from_ic_miss",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Misses from L1 Instruction Cache Misses",
+    "UMask": "0x01"
+  },
+  {
+    "EventName": "l2_cache_misses_from_dc_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Misses from L1 Data Cache Misses",
+    "UMask": "0x08"
+  },
+  {
+    "MetricName": "l2_cache_misses_from_l2_hwpf",
+    "BriefDescription": "L2 Cache Misses from L2 Cache HWPF",
+    "MetricExpr": "l2_pf_miss_l2_hit_l3 + l2_pf_miss_l2_l3",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "all_l2_cache_hits",
+    "BriefDescription": "All L2 Cache Hits",
+    "MetricExpr": "l2_cache_req_stat.ic_dc_hit_in_l2 + l2_pf_hit_l2",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "EventName": "l2_cache_hits_from_ic_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Hits from L1 Instruction Cache Misses",
+    "UMask": "0x06"
+  },
+  {
+    "EventName": "l2_cache_hits_from_dc_misses",
+    "EventCode": "0x64",
+    "BriefDescription": "L2 Cache Hits from L1 Data Cache Misses",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "l2_cache_hits_from_l2_hwpf",
+    "EventCode": "0x70",
+    "BriefDescription": "L2 Cache Hits from L2 Cache HWPF",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l3_cache_accesses",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 Cache Accesses",
+    "UMask": "0xff",
+    "Unit": "L3PMC"
+  },
+  {
+    "EventName": "l3_misses",
+    "EventCode": "0x04",
+    "BriefDescription": "L3 Misses (includes cacheline state change requests)",
+    "UMask": "0x01",
+    "Unit": "L3PMC"
+  },
+  {
+    "MetricName": "l3_read_miss_latency",
+    "BriefDescription": "Average L3 Read Miss Latency (in core clocks)",
+    "MetricExpr": "(xi_sys_fill_latency * 16) / xi_ccx_sdp_req1",
+    "MetricGroup": "l3_cache",
+    "ScaleUnit": "1core clocks"
+  },
+  {
+    "MetricName": "op_cache_fetch_miss_ratio",
+    "BriefDescription": "Op Cache (64B) Fetch Miss Ratio",
+    "MetricExpr": "d_ratio(op_cache_hit_miss.op_cache_miss, op_cache_hit_miss.all_op_cache_accesses)",
+    "MetricGroup": "l2_cache"
+  },
+  {
+    "MetricName": "ic_fetch_miss_ratio",
+    "BriefDescription": "Instruction Cache (32B) Fetch Miss Ratio",
+    "MetricExpr": "d_ratio(ic_tag_hit_miss.instruction_cache_miss, ic_tag_hit_miss.all_instruction_cache_accesses)",
+    "MetricGroup": "l2_cache",
+    "ScaleUnit": "100%"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_memory",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From Memory",
+    "UMask": "0x48"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_remote_node",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From Remote Node",
+    "UMask": "0x50"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_within_same_ccx",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From within same CCX",
+    "UMask": "0x03"
+  },
+  {
+    "EventName": "l1_data_cache_fills_from_external_ccx_cache",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: From External CCX Cache",
+    "UMask": "0x14"
+  },
+  {
+    "EventName": "l1_data_cache_fills_all",
+    "EventCode": "0x44",
+    "BriefDescription": "L1 Data Cache Fills: All",
+    "UMask": "0xff"
+  },
+  {
+    "MetricName": "l1_itlb_misses",
+    "BriefDescription": "L1 ITLB Misses",
+    "MetricExpr": "bp_l1_tlb_miss_l2_tlb_hit + bp_l1_tlb_miss_l2_tlb_miss",
+    "MetricGroup": "tlb"
+  },
+  {
+    "EventName": "l2_itlb_misses",
+    "EventCode": "0x85",
+    "BriefDescription": "L2 ITLB Misses & Instruction page walks",
+    "UMask": "0x07"
+  },
+  {
+    "EventName": "l1_dtlb_misses",
+    "EventCode": "0x45",
+    "BriefDescription": "L1 DTLB Misses",
+    "UMask": "0xff"
+  },
+  {
+    "EventName": "l2_dtlb_misses",
+    "EventCode": "0x45",
+    "BriefDescription": "L2 DTLB Misses & Data page walks",
+    "UMask": "0xf0"
+  },
+  {
+    "EventName": "all_tlbs_flushed",
+    "EventCode": "0x78",
+    "BriefDescription": "All TLBs Flushed",
+    "UMask": "0xff"
+  },
+  {
+    "MetricName": "macro_ops_dispatched",
+    "BriefDescription": "Macro-ops Dispatched",
+    "MetricExpr": "de_dis_cops_from_decoder.disp_op_type.any_integer_dispatch + de_dis_cops_from_decoder.disp_op_type.any_fp_dispatch",
+    "MetricGroup": "decoder"
+  },
+  {
+    "EventName": "sse_avx_stalls",
+    "EventCode": "0x0e",
+    "BriefDescription": "Mixed SSE/AVX Stalls",
+    "UMask": "0x0e"
+  },
+  {
+    "EventName": "macro_ops_retired",
+    "EventCode": "0xc1",
+    "BriefDescription": "Macro-ops Retired"
+  },
+  {
+    "MetricName": "all_remote_links_outbound",
+    "BriefDescription": "Approximate: Outbound data bytes for all Remote Links for a node (die)",
+    "MetricExpr": "remote_outbound_data_controller_0 + remote_outbound_data_controller_1 + remote_outbound_data_controller_2 + remote_outbound_data_controller_3",
+    "MetricGroup": "data_fabric",
+    "PerPkg": "1",
+    "ScaleUnit": "3e-5MiB"
+  },
+  {
+    "MetricName": "nps1_die_to_dram",
+    "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die) (may need --metric-no-group)",
+    "MetricExpr": "dram_channel_data_controller_0 + dram_channel_data_controller_1 + dram_channel_data_controller_2 + dram_channel_data_controller_3 + dram_channel_data_controller_4 + dram_channel_data_controller_5 + dram_channel_data_controller_6 + dram_channel_data_controller_7",
+    "MetricGroup": "data_fabric",
+    "PerPkg": "1",
+    "ScaleUnit": "6.1e-5MiB"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 2f2a209e87e1..0a6a8c7f937f 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -24,6 +24,7 @@ GenuineIntel-6-1F,v2,nehalemep,core
 GenuineIntel-6-1A,v2,nehalemep,core
 GenuineIntel-6-2E,v2,nehalemex,core
 GenuineIntel-6-[4589]E,v24,skylake,core
+GenuineIntel-6-A[56],v24,skylake,core
 GenuineIntel-6-37,v13,silvermont,core
 GenuineIntel-6-4D,v13,silvermont,core
 GenuineIntel-6-4C,v13,silvermont,core
@@ -35,7 +36,9 @@ GenuineIntel-6-55-[01234],v1,skylakex,core
 GenuineIntel-6-55-[56789ABCDEF],v1,cascadelakex,core
 GenuineIntel-6-7D,v1,icelake,core
 GenuineIntel-6-7E,v1,icelake,core
+GenuineIntel-6-8[CD],v1,icelake,core
+GenuineIntel-6-A7,v1,icelake,core
 GenuineIntel-6-86,v1,tremontx,core
 AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),v2,amdzen1,core
 AuthenticAMD-23-[[:xdigit:]]+,v1,amdzen2,core
-AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen2,core
+AuthenticAMD-25-[[:xdigit:]]+,v1,amdzen3,core
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index e1f3f5c8c550..ed4f0bd72e5a 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -285,6 +285,8 @@ static struct map {
 	{ "imx8_ddr", "imx8_ddr" },
 	{ "L3PMC", "amd_l3" },
 	{ "DFPMC", "amd_df" },
+	{ "cpu_core", "cpu_core" },
+	{ "cpu_atom", "cpu_atom" },
 	{}
 };
 
@@ -1149,7 +1151,7 @@ static int process_one_file(const char *fpath, const struct stat *sb,
 	 * and directory tree could result in build failure due to table
 	 * names not being found.
 	 *
-	 * Atleast for now, be strict with processing JSON file names.
+	 * At least for now, be strict with processing JSON file names.
 	 * i.e. if JSON file name cannot be mapped to C-style table name,
 	 * fail.
 	 */
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py
index ea0c8b90a783..a0cfc7fe5908 100644
--- a/tools/perf/scripts/python/netdev-times.py
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -356,7 +356,7 @@ def handle_irq_softirq_exit(event_info):
 		return
 	rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
 			'irq_list':irq_list, 'event_list':event_list}
-	# merge information realted to a NET_RX softirq
+	# merge information related to a NET_RX softirq
 	receive_hunk_list.append(rec_data)
 
 def handle_napi_poll(event_info):
diff --git a/tools/perf/tests/attr.c b/tools/perf/tests/attr.c
index dd39ce9b0277..9b40a25376ae 100644
--- a/tools/perf/tests/attr.c
+++ b/tools/perf/tests/attr.c
@@ -34,6 +34,7 @@
 #include "event.h"
 #include "util.h"
 #include "tests.h"
+#include "pmu.h"
 
 #define ENV "PERF_TEST_ATTR"
 
@@ -184,6 +185,9 @@ int test__attr(struct test *test __maybe_unused, int subtest __maybe_unused)
 	char path_dir[PATH_MAX];
 	char *exec_path;
 
+	if (perf_pmu__has_hybrid())
+		return TEST_SKIP;
+
 	/* First try development tree tests. */
 	if (!lstat("./tests", &st))
 		return run_dir("./tests", "./perf");
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c
index cc9fbcedb364..ef37353636d8 100644
--- a/tools/perf/tests/bp_signal.c
+++ b/tools/perf/tests/bp_signal.c
@@ -225,11 +225,11 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused
 	 *
 	 * The test case check following error conditions:
 	 * - we get stuck in signal handler because of debug
-	 *   exception being triggered receursively due to
+	 *   exception being triggered recursively due to
 	 *   the wrong RF EFLAG management
 	 *
 	 * - we never trigger the sig_handler breakpoint due
-	 *   to the rong RF EFLAG management
+	 *   to the wrong RF EFLAG management
 	 *
 	 */
 
@@ -242,7 +242,7 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused
 	ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0);
 
 	/*
-	 * Kick off the test by trigering 'fd1'
+	 * Kick off the test by triggering 'fd1'
 	 * breakpoint.
 	 */
 	test_function();
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 2fdc7b2f996e..9866cddebf23 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -658,7 +658,7 @@ static int do_test_code_reading(bool try_kcore)
 				/*
 				 * Both cpus and threads are now owned by evlist
 				 * and will be freed by following perf_evlist__set_maps
-				 * call. Getting refference to keep them alive.
+				 * call. Getting reference to keep them alive.
 				 */
 				perf_cpu_map__get(cpus);
 				perf_thread_map__get(threads);
diff --git a/tools/perf/tests/demangle-ocaml-test.c b/tools/perf/tests/demangle-ocaml-test.c
index a273ed5163d7..0043be812355 100644
--- a/tools/perf/tests/demangle-ocaml-test.c
+++ b/tools/perf/tests/demangle-ocaml-test.c
@@ -19,14 +19,14 @@ int test__demangle_ocaml(struct test *test __maybe_unused, int subtest __maybe_u
 		{ "main",
 		  NULL },
 		{ "camlStdlib__array__map_154",
-		  "Stdlib.array.map" },
+		  "Stdlib.array.map_154" },
 		{ "camlStdlib__anon_fn$5bstdlib$2eml$3a334$2c0$2d$2d54$5d_1453",
-		  "Stdlib.anon_fn[stdlib.ml:334,0--54]" },
+		  "Stdlib.anon_fn[stdlib.ml:334,0--54]_1453" },
 		{ "camlStdlib__bytes__$2b$2b_2205",
-		  "Stdlib.bytes.++" },
+		  "Stdlib.bytes.++_2205" },
 	};
 
-	for (i = 0; i < sizeof(test_cases) / sizeof(test_cases[0]); i++) {
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
 		buf = ocaml_demangle_sym(test_cases[i].mangled);
 		if ((buf == NULL && test_cases[i].demangled != NULL)
 				|| (buf != NULL && test_cases[i].demangled == NULL)
diff --git a/tools/perf/tests/evsel-roundtrip-name.c b/tools/perf/tests/evsel-roundtrip-name.c
index f7f3e5b4c180..b74cf80d1f10 100644
--- a/tools/perf/tests/evsel-roundtrip-name.c
+++ b/tools/perf/tests/evsel-roundtrip-name.c
@@ -4,6 +4,7 @@
 #include "parse-events.h"
 #include "tests.h"
 #include "debug.h"
+#include "pmu.h"
 #include <errno.h>
 #include <linux/kernel.h>
 
@@ -62,7 +63,8 @@ static int perf_evsel__roundtrip_cache_name_test(void)
 	return ret;
 }
 
-static int __perf_evsel__name_array_test(const char *names[], int nr_names)
+static int __perf_evsel__name_array_test(const char *names[], int nr_names,
+					 int distance)
 {
 	int i, err;
 	struct evsel *evsel;
@@ -82,9 +84,9 @@ static int __perf_evsel__name_array_test(const char *names[], int nr_names)
 
 	err = 0;
 	evlist__for_each_entry(evlist, evsel) {
-		if (strcmp(evsel__name(evsel), names[evsel->idx])) {
+		if (strcmp(evsel__name(evsel), names[evsel->idx / distance])) {
 			--err;
-			pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->idx]);
+			pr_debug("%s != %s\n", evsel__name(evsel), names[evsel->idx / distance]);
 		}
 	}
 
@@ -93,18 +95,21 @@ out_delete_evlist:
 	return err;
 }
 
-#define perf_evsel__name_array_test(names) \
-	__perf_evsel__name_array_test(names, ARRAY_SIZE(names))
+#define perf_evsel__name_array_test(names, distance) \
+	__perf_evsel__name_array_test(names, ARRAY_SIZE(names), distance)
 
 int test__perf_evsel__roundtrip_name_test(struct test *test __maybe_unused, int subtest __maybe_unused)
 {
 	int err = 0, ret = 0;
 
-	err = perf_evsel__name_array_test(evsel__hw_names);
+	if (perf_pmu__has_hybrid())
+		return perf_evsel__name_array_test(evsel__hw_names, 2);
+
+	err = perf_evsel__name_array_test(evsel__hw_names, 1);
 	if (err)
 		ret = err;
 
-	err = __perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1);
+	err = __perf_evsel__name_array_test(evsel__sw_names, PERF_COUNT_SW_DUMMY + 1, 1);
 	if (err)
 		ret = err;
 
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index 3f2e1a581247..890cb1f5bf53 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -47,7 +47,7 @@ static struct sample fake_samples[] = {
 };
 
 /*
- * Will be casted to struct ip_callchain which has all 64 bit entries
+ * Will be cast to struct ip_callchain which has all 64 bit entries
  * of nr and ips[].
  */
 static u64 fake_callchains[][10] = {
@@ -297,7 +297,7 @@ out:
 	return err;
 }
 
-/* callcain + NO children */
+/* callchain + NO children */
 static int test2(struct evsel *evsel, struct machine *machine)
 {
 	int err;
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c
index 123e07d35b55..ca6120cd1d90 100644
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -150,13 +150,13 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 		}
 
 		TEST_ASSERT_VAL("Invalid nr samples",
-				hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+				hists->stats.nr_samples == 10);
 		TEST_ASSERT_VAL("Invalid nr hist entries",
 				hists->nr_entries == 9);
 		TEST_ASSERT_VAL("Invalid total period",
 				hists->stats.total_period == 1000);
 		TEST_ASSERT_VAL("Unmatched nr samples",
-				hists->stats.nr_events[PERF_RECORD_SAMPLE] ==
+				hists->stats.nr_samples ==
 				hists->stats.nr_non_filtered_samples);
 		TEST_ASSERT_VAL("Unmatched nr hist entries",
 				hists->nr_entries == hists->nr_non_filtered_entries);
@@ -175,7 +175,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
 		/* normal stats should be invariant */
 		TEST_ASSERT_VAL("Invalid nr samples",
-				hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+				hists->stats.nr_samples == 10);
 		TEST_ASSERT_VAL("Invalid nr hist entries",
 				hists->nr_entries == 9);
 		TEST_ASSERT_VAL("Invalid total period",
@@ -204,7 +204,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
 		/* normal stats should be invariant */
 		TEST_ASSERT_VAL("Invalid nr samples",
-				hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+				hists->stats.nr_samples == 10);
 		TEST_ASSERT_VAL("Invalid nr hist entries",
 				hists->nr_entries == 9);
 		TEST_ASSERT_VAL("Invalid total period",
@@ -239,7 +239,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
 		/* normal stats should be invariant */
 		TEST_ASSERT_VAL("Invalid nr samples",
-				hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+				hists->stats.nr_samples == 10);
 		TEST_ASSERT_VAL("Invalid nr hist entries",
 				hists->nr_entries == 9);
 		TEST_ASSERT_VAL("Invalid total period",
@@ -268,7 +268,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
 		/* normal stats should be invariant */
 		TEST_ASSERT_VAL("Invalid nr samples",
-				hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+				hists->stats.nr_samples == 10);
 		TEST_ASSERT_VAL("Invalid nr hist entries",
 				hists->nr_entries == 9);
 		TEST_ASSERT_VAL("Invalid total period",
@@ -299,7 +299,7 @@ int test__hists_filter(struct test *test __maybe_unused, int subtest __maybe_unu
 
 		/* normal stats should be invariant */
 		TEST_ASSERT_VAL("Invalid nr samples",
-				hists->stats.nr_events[PERF_RECORD_SAMPLE] == 10);
+				hists->stats.nr_samples == 10);
 		TEST_ASSERT_VAL("Invalid nr hist entries",
 				hists->nr_entries == 9);
 		TEST_ASSERT_VAL("Invalid total period",
diff --git a/tools/perf/tests/make b/tools/perf/tests/make
index a90fa043c066..94bd5d215d94 100644
--- a/tools/perf/tests/make
+++ b/tools/perf/tests/make
@@ -155,7 +155,6 @@ run += make_no_syscall_tbl
 run += make_with_babeltrace
 run += make_with_clangllvm
 run += make_with_libpfm4
-run += make_with_gtk2
 run += make_help
 run += make_doc
 run += make_perf_o
@@ -172,7 +171,6 @@ run += make_install_prefix_slash
 # run += make_install_info
 # run += make_install_pdf
 run += make_minimal
-run += make_static
 
 ifneq ($(call has,ctags),)
 run += make_tags
@@ -307,6 +305,26 @@ $(run):
 	$(call test,$@) && \
 	rm -rf $@ $$TMP_DEST || (cat $@ ; false)
 
+make_with_gtk2:
+	$(call clean)
+	@TMP_DEST=$$(mktemp -d); \
+	cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \
+	printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
+	( eval $$cmd ) >> $@ 2>&1; \
+	echo "  test: $(call test,$@)" >> $@ 2>&1; \
+	$(call test,$@) && \
+	rm -rf $@ $$TMP_DEST || (cat $@ ; false)
+
+make_static:
+	$(call clean)
+	@TMP_DEST=$$(mktemp -d); \
+	cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \
+	printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \
+	( eval $$cmd ) >> $@ 2>&1; \
+	echo "  test: $(call test,$@)" >> $@ 2>&1; \
+	$(call test,$@) && \
+	rm -rf $@ $$TMP_DEST || (cat $@ ; false)
+
 $(run_O):
 	$(call clean)
 	@TMP_O=$$(mktemp -d); \
diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index a7f6661e6112..0f113b2b36a3 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -20,7 +20,7 @@
 
 #if defined(__s390x__)
 /* Return true if kvm module is available and loaded. Test this
- * and retun success when trace point kvm_s390_create_vm
+ * and return success when trace point kvm_s390_create_vm
  * exists. Otherwise this test always fails.
  */
 static bool kvm_s390_create_vm_valid(void)
@@ -1512,6 +1512,124 @@ static int test__all_tracepoints(struct evlist *evlist)
 	return test__checkevent_tracepoint_multi(evlist);
 }
 
+static int test__hybrid_hw_event_with_pmu(struct evlist *evlist)
+{
+	struct evsel *evsel = evlist__first(evlist);
+
+	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+	return 0;
+}
+
+static int test__hybrid_hw_group_event(struct evlist *evlist)
+{
+	struct evsel *evsel, *leader;
+
+	evsel = leader = evlist__first(evlist);
+	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+	evsel = evsel__next(evsel);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+	return 0;
+}
+
+static int test__hybrid_sw_hw_group_event(struct evlist *evlist)
+{
+	struct evsel *evsel, *leader;
+
+	evsel = leader = evlist__first(evlist);
+	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+	evsel = evsel__next(evsel);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+	return 0;
+}
+
+static int test__hybrid_hw_sw_group_event(struct evlist *evlist)
+{
+	struct evsel *evsel, *leader;
+
+	evsel = leader = evlist__first(evlist);
+	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+
+	evsel = evsel__next(evsel);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_SOFTWARE == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+	return 0;
+}
+
+static int test__hybrid_group_modifier1(struct evlist *evlist)
+{
+	struct evsel *evsel, *leader;
+
+	evsel = leader = evlist__first(evlist);
+	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x3c == evsel->core.attr.config);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+	TEST_ASSERT_VAL("wrong exclude_user", evsel->core.attr.exclude_user);
+	TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->core.attr.exclude_kernel);
+
+	evsel = evsel__next(evsel);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0xc0 == evsel->core.attr.config);
+	TEST_ASSERT_VAL("wrong leader", evsel->leader == leader);
+	TEST_ASSERT_VAL("wrong exclude_user", !evsel->core.attr.exclude_user);
+	TEST_ASSERT_VAL("wrong exclude_kernel", evsel->core.attr.exclude_kernel);
+	return 0;
+}
+
+static int test__hybrid_raw1(struct evlist *evlist)
+{
+	struct evsel *evsel = evlist__first(evlist);
+
+	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config);
+
+	/* The type of second event is randome value */
+	evsel = evsel__next(evsel);
+	TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config);
+	return 0;
+}
+
+static int test__hybrid_raw2(struct evlist *evlist)
+{
+	struct evsel *evsel = evlist__first(evlist);
+
+	TEST_ASSERT_VAL("wrong number of entries", 1 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_RAW == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x1a == evsel->core.attr.config);
+	return 0;
+}
+
+static int test__hybrid_cache_event(struct evlist *evlist)
+{
+	struct evsel *evsel = evlist__first(evlist);
+
+	TEST_ASSERT_VAL("wrong number of entries", 2 == evlist->core.nr_entries);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x2 == (evsel->core.attr.config & 0xffffffff));
+
+	evsel = evsel__next(evsel);
+	TEST_ASSERT_VAL("wrong type", PERF_TYPE_HW_CACHE == evsel->core.attr.type);
+	TEST_ASSERT_VAL("wrong config", 0x10002 == (evsel->core.attr.config & 0xffffffff));
+	return 0;
+}
+
 struct evlist_test {
 	const char *name;
 	__u32 type;
@@ -1868,6 +1986,54 @@ static struct terms_test test__terms[] = {
 	},
 };
 
+static struct evlist_test test__hybrid_events[] = {
+	{
+		.name  = "cpu_core/cpu-cycles/",
+		.check = test__hybrid_hw_event_with_pmu,
+		.id    = 0,
+	},
+	{
+		.name  = "{cpu_core/cpu-cycles/,cpu_core/instructions/}",
+		.check = test__hybrid_hw_group_event,
+		.id    = 1,
+	},
+	{
+		.name  = "{cpu-clock,cpu_core/cpu-cycles/}",
+		.check = test__hybrid_sw_hw_group_event,
+		.id    = 2,
+	},
+	{
+		.name  = "{cpu_core/cpu-cycles/,cpu-clock}",
+		.check = test__hybrid_hw_sw_group_event,
+		.id    = 3,
+	},
+	{
+		.name  = "{cpu_core/cpu-cycles/k,cpu_core/instructions/u}",
+		.check = test__hybrid_group_modifier1,
+		.id    = 4,
+	},
+	{
+		.name  = "r1a",
+		.check = test__hybrid_raw1,
+		.id    = 5,
+	},
+	{
+		.name  = "cpu_core/r1a/",
+		.check = test__hybrid_raw2,
+		.id    = 6,
+	},
+	{
+		.name  = "cpu_core/config=10,config1,config2=3,period=1000/u",
+		.check = test__checkevent_pmu,
+		.id    = 7,
+	},
+	{
+		.name  = "cpu_core/LLC-loads/,cpu_atom/LLC-load-misses/",
+		.check = test__hybrid_cache_event,
+		.id    = 8,
+	},
+};
+
 static int test_event(struct evlist_test *e)
 {
 	struct parse_events_error err;
@@ -2035,6 +2201,11 @@ do {							\
 		ret2 = ret1;				\
 } while (0)
 
+	if (perf_pmu__has_hybrid()) {
+		TEST_EVENTS(test__hybrid_events);
+		return ret2;
+	}
+
 	TEST_EVENTS(test__events);
 
 	if (test_pmu())
diff --git a/tools/perf/tests/parse-metric.c b/tools/perf/tests/parse-metric.c
index 6dc1db1626ad..4f6f4904e852 100644
--- a/tools/perf/tests/parse-metric.c
+++ b/tools/perf/tests/parse-metric.c
@@ -11,6 +11,7 @@
 #include "debug.h"
 #include "expr.h"
 #include "stat.h"
+#include "pmu.h"
 
 static struct pmu_event pme_test[] = {
 {
@@ -98,7 +99,7 @@ static u64 find_value(const char *name, struct value *values)
 		if (!strcmp(name, v->event))
 			return v->val;
 		v++;
-	};
+	}
 	return 0;
 }
 
@@ -186,7 +187,7 @@ static int __compute_metric(const char *name, struct value *vals,
 		*ratio2 = compute_single(&metric_events, evlist, &st, name2);
 
 out:
-	/* ... clenup. */
+	/* ... cleanup. */
 	metricgroup__rblist_exit(&metric_events);
 	runtime_stat__exit(&st);
 	evlist__free_stats(evlist);
@@ -372,10 +373,13 @@ int test__parse_metric(struct test *test __maybe_unused, int subtest __maybe_unu
 {
 	TEST_ASSERT_VAL("IPC failed", test_ipc() == 0);
 	TEST_ASSERT_VAL("frontend failed", test_frontend() == 0);
-	TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0);
 	TEST_ASSERT_VAL("DCache_L2 failed", test_dcache_l2() == 0);
 	TEST_ASSERT_VAL("recursion fail failed", test_recursion_fail() == 0);
-	TEST_ASSERT_VAL("test metric group", test_metric_group() == 0);
 	TEST_ASSERT_VAL("Memory bandwidth", test_memory_bandwidth() == 0);
+
+	if (!perf_pmu__has_hybrid()) {
+		TEST_ASSERT_VAL("cache_miss_cycles failed", test_cache_miss_cycles() == 0);
+		TEST_ASSERT_VAL("test metric group", test_metric_group() == 0);
+	}
 	return 0;
 }
diff --git a/tools/perf/tests/perf-time-to-tsc.c b/tools/perf/tests/perf-time-to-tsc.c
index 680c3cffb128..85d75b9b25a1 100644
--- a/tools/perf/tests/perf-time-to-tsc.c
+++ b/tools/perf/tests/perf-time-to-tsc.c
@@ -20,6 +20,7 @@
 #include "tsc.h"
 #include "mmap.h"
 #include "tests.h"
+#include "pmu.h"
 
 #define CHECK__(x) {				\
 	while ((x) < 0) {			\
@@ -88,6 +89,17 @@ int test__perf_time_to_tsc(struct test *test __maybe_unused, int subtest __maybe
 	evsel->core.attr.disabled = 1;
 	evsel->core.attr.enable_on_exec = 0;
 
+	/*
+	 * For hybrid "cycles:u", it creates two events.
+	 * Init the second evsel here.
+	 */
+	if (perf_pmu__has_hybrid()) {
+		evsel = evsel__next(evsel);
+		evsel->core.attr.comm = 1;
+		evsel->core.attr.disabled = 1;
+		evsel->core.attr.enable_on_exec = 0;
+	}
+
 	CHECK__(evlist__open(evlist));
 
 	CHECK__(evlist__mmap(evlist, UINT_MAX));
diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c
index 0ca6a5a53523..b8aff8fb50d8 100644
--- a/tools/perf/tests/pmu-events.c
+++ b/tools/perf/tests/pmu-events.c
@@ -12,6 +12,7 @@
 #include "util/evlist.h"
 #include "util/expr.h"
 #include "util/parse-events.h"
+#include "metricgroup.h"
 
 struct perf_pmu_test_event {
 	/* used for matching against events from generated pmu-events.c */
@@ -471,9 +472,74 @@ static void expr_failure(const char *msg,
 	pr_debug("On expression %s\n", pe->metric_expr);
 }
 
+struct metric {
+	struct list_head list;
+	struct metric_ref metric_ref;
+};
+
+static int resolve_metric_simple(struct expr_parse_ctx *pctx,
+				 struct list_head *compound_list,
+				 struct pmu_events_map *map,
+				 const char *metric_name)
+{
+	struct hashmap_entry *cur, *cur_tmp;
+	struct metric *metric, *tmp;
+	size_t bkt;
+	bool all;
+	int rc;
+
+	do {
+		all = true;
+		hashmap__for_each_entry_safe((&pctx->ids), cur, cur_tmp, bkt) {
+			struct metric_ref *ref;
+			struct pmu_event *pe;
+
+			pe = metricgroup__find_metric(cur->key, map);
+			if (!pe)
+				continue;
+
+			if (!strcmp(metric_name, (char *)cur->key)) {
+				pr_warning("Recursion detected for metric %s\n", metric_name);
+				rc = -1;
+				goto out_err;
+			}
+
+			all = false;
+
+			/* The metric key itself needs to go out.. */
+			expr__del_id(pctx, cur->key);
+
+			metric = malloc(sizeof(*metric));
+			if (!metric) {
+				rc = -ENOMEM;
+				goto out_err;
+			}
+
+			ref = &metric->metric_ref;
+			ref->metric_name = pe->metric_name;
+			ref->metric_expr = pe->metric_expr;
+			list_add_tail(&metric->list, compound_list);
+
+			rc = expr__find_other(pe->metric_expr, NULL, pctx, 0);
+			if (rc)
+				goto out_err;
+			break; /* The hashmap has been modified, so restart */
+		}
+	} while (!all);
+
+	return 0;
+
+out_err:
+	list_for_each_entry_safe(metric, tmp, compound_list, list)
+		free(metric);
+
+	return rc;
+
+}
+
 static int test_parsing(void)
 {
-	struct pmu_events_map *cpus_map = perf_pmu__find_map(NULL);
+	struct pmu_events_map *cpus_map = pmu_events_map__find();
 	struct pmu_events_map *map;
 	struct pmu_event *pe;
 	int i, j, k;
@@ -488,7 +554,9 @@ static int test_parsing(void)
 			break;
 		j = 0;
 		for (;;) {
+			struct metric *metric, *tmp;
 			struct hashmap_entry *cur;
+			LIST_HEAD(compound_list);
 			size_t bkt;
 
 			pe = &map->table[j++];
@@ -504,6 +572,13 @@ static int test_parsing(void)
 				continue;
 			}
 
+			if (resolve_metric_simple(&ctx, &compound_list, map,
+						  pe->metric_name)) {
+				expr_failure("Could not resolve metrics", map, pe);
+				ret++;
+				goto exit; /* Don't tolerate errors due to severity */
+			}
+
 			/*
 			 * Add all ids with a made up value. The value may
 			 * trigger divide by zero when subtracted and so try to
@@ -519,6 +594,11 @@ static int test_parsing(void)
 					ret++;
 			}
 
+			list_for_each_entry_safe(metric, tmp, &compound_list, list) {
+				expr__add_ref(&ctx, &metric->metric_ref);
+				free(metric);
+			}
+
 			if (expr__parse(&result, &ctx, pe->metric_expr, 0)) {
 				expr_failure("Parse failed", map, pe);
 				ret++;
@@ -527,6 +607,7 @@ static int test_parsing(void)
 		}
 	}
 	/* TODO: fail when not ok */
+exit:
 	return ret == 0 ? TEST_OK : TEST_SKIP;
 }
 
diff --git a/tools/perf/tests/shell/buildid.sh b/tools/perf/tests/shell/buildid.sh
index 416af614bbe0..f05670d1e39e 100755
--- a/tools/perf/tests/shell/buildid.sh
+++ b/tools/perf/tests/shell/buildid.sh
@@ -14,18 +14,56 @@ if ! [ -x "$(command -v cc)" ]; then
 	exit 2
 fi
 
+# check what we need to test windows binaries
+add_pe=1
+run_pe=1
+if ! perf version --build-options | grep -q 'libbfd: .* on '; then
+	echo "WARNING: perf not built with libbfd. PE binaries will not be tested."
+	add_pe=0
+	run_pe=0
+fi
+if ! which wine > /dev/null; then
+	echo "WARNING: wine not found. PE binaries will not be run."
+	run_pe=0
+fi
+
+# set up wine
+if [ ${run_pe} -eq 1 ]; then
+	wineprefix=$(mktemp -d /tmp/perf.wineprefix.XXX)
+	export WINEPREFIX=${wineprefix}
+	# clear display variables to prevent wine from popping up dialogs
+	unset DISPLAY
+	unset WAYLAND_DISPLAY
+fi
+
 ex_md5=$(mktemp /tmp/perf.ex.MD5.XXX)
 ex_sha1=$(mktemp /tmp/perf.ex.SHA1.XXX)
+ex_pe=$(dirname $0)/../pe-file.exe
 
 echo 'int main(void) { return 0; }' | cc -Wl,--build-id=sha1 -o ${ex_sha1} -x c -
 echo 'int main(void) { return 0; }' | cc -Wl,--build-id=md5 -o ${ex_md5} -x c -
 
-echo "test binaries: ${ex_sha1} ${ex_md5}"
+echo "test binaries: ${ex_sha1} ${ex_md5} ${ex_pe}"
 
 check()
 {
-	id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'`
-
+	case $1 in
+	*.exe)
+		# We don't have a tool that can pull a nicely formatted build-id out of
+		# a PE file, but we can extract the whole section with objcopy and
+		# format it ourselves. The .buildid section is a Debug Directory
+		# containing a CodeView entry:
+		#     https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#debug-directory-image-only
+		#     https://github.com/dotnet/runtime/blob/da94c022576a5c3bbc0e896f006565905eb137f9/docs/design/specs/PE-COFF.md
+		# The build-id starts at byte 33 and must be rearranged into a GUID.
+		id=`objcopy -O binary --only-section=.buildid $1 /dev/stdout | \
+			cut -c 33-48 | hexdump -ve '/1 "%02x"' | \
+			sed 's@^\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(..\)\(.*\)0a$@\4\3\2\1\6\5\8\7\9@'`
+		;;
+	*)
+		id=`readelf -n ${1} 2>/dev/null | grep 'Build ID' | awk '{print $3}'`
+		;;
+	esac
 	echo "build id: ${id}"
 
 	link=${build_id_dir}/.build-id/${id:0:2}/${id:2}
@@ -50,7 +88,7 @@ check()
 		exit 1
 	fi
 
-	${perf} buildid-cache -l | grep $id
+	${perf} buildid-cache -l | grep ${id}
 	if [ $? -ne 0 ]; then
 		echo "failed: ${id} is not reported by \"perf buildid-cache -l\""
 		exit 1
@@ -79,16 +117,20 @@ test_record()
 {
 	data=$(mktemp /tmp/perf.data.XXX)
 	build_id_dir=$(mktemp -d /tmp/perf.debug.XXX)
+	log=$(mktemp /tmp/perf.log.XXX)
 	perf="perf --buildid-dir ${build_id_dir}"
 
-	${perf} record --buildid-all -o ${data} ${1}
+	echo "running: perf record $@"
+	${perf} record --buildid-all -o ${data} $@ &> ${log}
 	if [ $? -ne 0 ]; then
-		echo "failed: record ${1}"
+		echo "failed: record $@"
+		echo "see log: ${log}"
 		exit 1
 	fi
 
-	check ${1}
+	check ${@: -1}
 
+	rm -f ${log}
 	rm -rf ${build_id_dir}
 	rm -rf ${data}
 }
@@ -96,12 +138,21 @@ test_record()
 # add binaries manual via perf buildid-cache -a
 test_add ${ex_sha1}
 test_add ${ex_md5}
+if [ ${add_pe} -eq 1 ]; then
+	test_add ${ex_pe}
+fi
 
 # add binaries via perf record post processing
 test_record ${ex_sha1}
 test_record ${ex_md5}
+if [ ${run_pe} -eq 1 ]; then
+	test_record wine ${ex_pe}
+fi
 
 # cleanup
 rm ${ex_sha1} ${ex_md5}
+if [ ${run_pe} -eq 1 ]; then
+	rm -r ${wineprefix}
+fi
 
 exit ${err}
diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh
index 58984380b211..45fc24af5b07 100755
--- a/tools/perf/tests/shell/daemon.sh
+++ b/tools/perf/tests/shell/daemon.sh
@@ -98,6 +98,23 @@ check_line_other()
 	fi
 }
 
+daemon_exit()
+{
+	local config=$1
+
+	local line=`perf daemon --config ${config} -x: | head -1`
+	local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'`
+
+	# Reset trap handler.
+	trap - SIGINT SIGTERM
+
+	# stop daemon
+	perf daemon stop --config ${config}
+
+	# ... and wait for the pid to go away
+	tail --pid=${pid} -f /dev/null
+}
+
 daemon_start()
 {
 	local config=$1
@@ -105,29 +122,24 @@ daemon_start()
 
 	perf daemon start --config ${config}
 
+	# Clean up daemon if interrupted.
+	trap "echo 'FAILED: Signal caught'; daemon_exit ${config}; exit 1" SIGINT SIGTERM
+
 	# wait for the session to ping
 	local state="FAIL"
+	local retries=0
 	while [ "${state}" != "OK" ]; do
 		state=`perf daemon ping --config ${config} --session ${session} | awk '{ print $1 }'`
 		sleep 0.05
+		retries=$((${retries} +1))
+		if [ ${retries} -ge 600 ]; then
+			echo "FAILED: Timeout waiting for daemon to ping"
+			daemon_exit ${config}
+			exit 1
+		fi
 	done
 }
 
-daemon_exit()
-{
-	local base=$1
-	local config=$2
-
-	local line=`perf daemon --config ${config} -x: | head -1`
-	local pid=`echo "${line}" | awk 'BEGIN { FS = ":" } ; { print $1 }'`
-
-	# stop daemon
-	perf daemon stop --config ${config}
-
-	# ... and wait for the pid to go away
-	tail --pid=${pid} -f /dev/null
-}
-
 test_list()
 {
 	echo "test daemon list"
@@ -171,7 +183,7 @@ EOF
 			 ${base}/session-time/ack "0"
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
@@ -288,7 +300,7 @@ EOF
 	done
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
@@ -333,7 +345,7 @@ EOF
 	fi
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	# check that sessions are gone
 	if [ -d "/proc/${pid_size}" ]; then
@@ -374,7 +386,7 @@ EOF
 	perf daemon signal --config ${config}
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	# count is 2 perf.data for signals and 1 for perf record finished
 	count=`ls ${base}/session-test/ | grep perf.data | wc -l`
@@ -420,7 +432,7 @@ EOF
 	fi
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
@@ -457,7 +469,7 @@ EOF
 	fi
 
 	# stop daemon
-	daemon_exit ${base} ${config}
+	daemon_exit ${config}
 
 	rm -rf ${base}
 	rm -f ${config}
diff --git a/tools/perf/tests/shell/stat+csv_summary.sh b/tools/perf/tests/shell/stat+csv_summary.sh
new file mode 100755
index 000000000000..5571ff75eb42
--- /dev/null
+++ b/tools/perf/tests/shell/stat+csv_summary.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# perf stat csv summary test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+#
+#     1.001364330 9224197  cycles 8012885033 100.00
+#         summary 9224197  cycles 8012885033 100.00
+#
+perf stat -e cycles  -x' ' -I1000 --interval-count 1 --summary 2>&1 | \
+grep -e summary | \
+while read summary num event run pct
+do
+	if [ $summary != "summary" ]; then
+		exit 1
+	fi
+done
+
+#
+#     1.001360298 9148534  cycles 8012853854 100.00
+#9148534  cycles 8012853854 100.00
+#
+perf stat -e cycles  -x' ' -I1000 --interval-count 1 --summary --no-csv-summary 2>&1 | \
+grep -e summary | \
+while read num event run pct
+do
+	exit 1
+done
+
+exit 0
diff --git a/tools/perf/tests/shell/stat+shadow_stat.sh b/tools/perf/tests/shell/stat+shadow_stat.sh
index ebebd3596cf9..e6e35fc6c882 100755
--- a/tools/perf/tests/shell/stat+shadow_stat.sh
+++ b/tools/perf/tests/shell/stat+shadow_stat.sh
@@ -7,6 +7,9 @@ set -e
 # skip if system-wide mode is forbidden
 perf stat -a true > /dev/null 2>&1 || exit 2
 
+# skip if on hybrid platform
+perf stat -a -e cycles sleep 1 2>&1 | grep -e cpu_core && exit 2
+
 test_global_aggr()
 {
 	perf stat -a --no-big-num -e cycles,instructions sleep 1  2>&1 | \
diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh
new file mode 100755
index 000000000000..22eb31e48ca7
--- /dev/null
+++ b/tools/perf/tests/shell/stat_bpf_counters.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# perf stat --bpf-counters test
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# check whether $2 is within +/- 10% of $1
+compare_number()
+{
+       first_num=$1
+       second_num=$2
+
+       # upper bound is first_num * 110%
+       upper=$(( $first_num + $first_num / 10 ))
+       # lower bound is first_num * 90%
+       lower=$(( $first_num - $first_num / 10 ))
+
+       if [ $second_num -gt $upper ] || [ $second_num -lt $lower ]; then
+               echo "The difference between $first_num and $second_num are greater than 10%."
+               exit 1
+       fi
+}
+
+# skip if --bpf-counters is not supported
+perf stat --bpf-counters true > /dev/null 2>&1 || exit 2
+
+base_cycles=$(perf stat --no-big-num -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
+bpf_cycles=$(perf stat --no-big-num --bpf-counters -e cycles -- perf bench sched messaging -g 1 -l 100 -t 2>&1 | awk '/cycles/ {print $1}')
+
+compare_number $base_cycles $bpf_cycles
+exit 0
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index 3ebaa758df77..62c0ec21aaa8 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -18,6 +18,7 @@
 #include "record.h"
 #include "tests.h"
 #include "util/mmap.h"
+#include "pmu.h"
 
 static int spin_sleep(void)
 {
@@ -371,7 +372,10 @@ int test__switch_tracking(struct test *test __maybe_unused, int subtest __maybe_
 	cpu_clocks_evsel = evlist__last(evlist);
 
 	/* Second event */
-	err = parse_events(evlist, "cycles:u", NULL);
+	if (perf_pmu__has_hybrid())
+		err = parse_events(evlist, "cpu_core/cycles/u", NULL);
+	else
+		err = parse_events(evlist, "cycles:u", NULL);
 	if (err) {
 		pr_debug("Failed to parse event cycles:u\n");
 		goto out_err;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 74748ed75b2c..ec4e3b21b831 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -8,6 +8,7 @@
 #include "session.h"
 #include "evlist.h"
 #include "debug.h"
+#include "pmu.h"
 #include <linux/err.h>
 
 #define TEMPL "/tmp/perf-test-XXXXXX"
@@ -40,8 +41,16 @@ static int session_write_header(char *path)
 	session = perf_session__new(&data, false, NULL);
 	TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
 
-	session->evlist = evlist__new_default();
-	TEST_ASSERT_VAL("can't get evlist", session->evlist);
+	if (!perf_pmu__has_hybrid()) {
+		session->evlist = evlist__new_default();
+		TEST_ASSERT_VAL("can't get evlist", session->evlist);
+	} else {
+		struct parse_events_error err;
+
+		session->evlist = evlist__new();
+		TEST_ASSERT_VAL("can't get evlist", session->evlist);
+		parse_events(session->evlist, "cpu_core/cycles/", &err);
+	}
 
 	perf_header__set_feat(&session->header, HEADER_CPU_TOPOLOGY);
 	perf_header__set_feat(&session->header, HEADER_NRCPUS);
@@ -80,7 +89,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 	 *   CPU 1 is on core_id 1 and physical_package_id 3
 	 *
 	 *   Core_id and physical_package_id are platform and architecture
-	 *   dependend and might have higher numbers than the CPU id.
+	 *   dependent and might have higher numbers than the CPU id.
 	 *   This actually depends on the configuration.
 	 *
 	 *  In this case process_cpu_topology() prints error message:
diff --git a/tools/perf/trace/beauty/fsconfig.sh b/tools/perf/trace/beauty/fsconfig.sh
index 83fb24df05c9..bc6ef7bb7a5f 100755
--- a/tools/perf/trace/beauty/fsconfig.sh
+++ b/tools/perf/trace/beauty/fsconfig.sh
@@ -10,8 +10,7 @@ fi
 linux_mount=${linux_header_dir}/mount.h
 
 printf "static const char *fsconfig_cmds[] = {\n"
-regex='^[[:space:]]*+FSCONFIG_([[:alnum:]_]+)[[:space:]]*=[[:space:]]*([[:digit:]]+)[[:space:]]*,[[:space:]]*.*'
-egrep $regex ${linux_mount} | \
-	sed -r "s/$regex/\2 \1/g"	| \
-	xargs printf "\t[%s] = \"%s\",\n"
+ms='[[:space:]]*'
+sed -nr "s/^${ms}FSCONFIG_([[:alnum:]_]+)${ms}=${ms}([[:digit:]]+)${ms},.*/\t[\2] = \"\1\",/p" \
+	${linux_mount}
 printf "};\n"
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index 385894b4a8bb..b8fc5c53ba6f 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -85,7 +85,7 @@ struct mmsghdr {
 
 /*
  *	POSIX 1003.1g - ancillary data object information
- *	Ancillary data consits of a sequence of pairs of
+ *	Ancillary data consists of a sequence of pairs of
  *	(cmsghdr, cmsg_data[])
  */
 
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index 35b82caf8090..ad0a70f0edaf 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -381,6 +381,25 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser)
 	return true;
 }
 
+#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64)
+
+static void annotate_browser__show_full_location(struct ui_browser *browser)
+{
+	struct annotate_browser *ab = container_of(browser, struct annotate_browser, b);
+	struct disasm_line *cursor = disasm_line(ab->selection);
+	struct annotation_line *al = &cursor->al;
+
+	if (al->offset != -1)
+		ui_helpline__puts("Only available for source code lines.");
+	else if (al->fileloc == NULL)
+		ui_helpline__puts("No source file location.");
+	else {
+		char help_line[SYM_TITLE_MAX_SIZE];
+		sprintf (help_line, "Source file location: %s", al->fileloc);
+		ui_helpline__puts(help_line);
+	}
+}
+
 static void ui_browser__init_asm_mode(struct ui_browser *browser)
 {
 	struct annotation *notes = browser__annotation(browser);
@@ -388,8 +407,6 @@ static void ui_browser__init_asm_mode(struct ui_browser *browser)
 	browser->nr_entries = notes->nr_asm_entries;
 }
 
-#define SYM_TITLE_MAX_SIZE (PATH_MAX + 64)
-
 static int sym_title(struct symbol *sym, struct map *map, char *title,
 		     size_t sz, int percent_type)
 {
@@ -398,7 +415,7 @@ static int sym_title(struct symbol *sym, struct map *map, char *title,
 }
 
 /*
- * This can be called from external jumps, i.e. jumps from one functon
+ * This can be called from external jumps, i.e. jumps from one function
  * to another, like from the kernel's entry_SYSCALL_64 function to the
  * swapgs_restore_regs_and_return_to_usermode() function.
  *
@@ -747,6 +764,7 @@ static int annotate_browser__run(struct annotate_browser *browser,
 		"c             Show min/max cycle\n"
 		"/             Search string\n"
 		"k             Toggle line numbers\n"
+		"l             Show full source file location\n"
 		"P             Print to [symbol_name].annotation file.\n"
 		"r             Run available scripts\n"
 		"p             Toggle percent type [local/global]\n"
@@ -760,6 +778,9 @@ static int annotate_browser__run(struct annotate_browser *browser,
 		case 'k':
 			notes->options->show_linenr = !notes->options->show_linenr;
 			continue;
+		case 'l':
+			annotate_browser__show_full_location (&browser->b);
+			continue;
 		case 'H':
 			nd = browser->curr_hot;
 			break;
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 3b9818ee9546..b72ee6822222 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -117,7 +117,7 @@ static void hist_browser__update_rows(struct hist_browser *hb)
 	browser->rows -= browser->extra_title_lines;
 	/*
 	 * Verify if we were at the last line and that line isn't
-	 * visibe because we now show the header line(s).
+	 * visible because we now show the header line(s).
 	 */
 	index_row = browser->index - browser->top_idx;
 	if (index_row >= browser->rows)
@@ -682,6 +682,7 @@ static int hist_browser__handle_hotkey(struct hist_browser *browser, bool warn_l
 	switch (key) {
 	case K_TIMER: {
 		struct hist_browser_timer *hbt = browser->hbt;
+		struct evsel *evsel = hists_to_evsel(browser->hists);
 		u64 nr_entries;
 
 		WARN_ON_ONCE(!hbt);
@@ -696,10 +697,10 @@ static int hist_browser__handle_hotkey(struct hist_browser *browser, bool warn_l
 		ui_browser__update_nr_entries(&browser->b, nr_entries);
 
 		if (warn_lost_event &&
-		    (browser->hists->stats.nr_lost_warned !=
-		    browser->hists->stats.nr_events[PERF_RECORD_LOST])) {
-			browser->hists->stats.nr_lost_warned =
-				browser->hists->stats.nr_events[PERF_RECORD_LOST];
+		    (evsel->evlist->stats.nr_lost_warned !=
+		     evsel->evlist->stats.nr_events[PERF_RECORD_LOST])) {
+			evsel->evlist->stats.nr_lost_warned =
+				evsel->evlist->stats.nr_events[PERF_RECORD_LOST];
 			ui_browser__warn_lost_events(&browser->b);
 		}
 
@@ -3416,7 +3417,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser,
 	struct evsel *evsel = list_entry(entry, struct evsel, core.node);
 	struct hists *hists = evsel__hists(evsel);
 	bool current_entry = ui_browser__is_current_entry(browser, row);
-	unsigned long nr_events = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+	unsigned long nr_events = hists->stats.nr_samples;
 	const char *ev_name = evsel__name(evsel);
 	char bf[256], unit;
 	const char *warn = " ";
@@ -3432,7 +3433,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser,
 
 		for_each_group_member(pos, evsel) {
 			struct hists *pos_hists = evsel__hists(pos);
-			nr_events += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE];
+			nr_events += pos_hists->stats.nr_samples;
 		}
 	}
 
@@ -3441,7 +3442,7 @@ static void perf_evsel_menu__write(struct ui_browser *browser,
 			   unit, unit == ' ' ? "" : " ", ev_name);
 	ui_browser__printf(browser, "%s", bf);
 
-	nr_events = hists->stats.nr_events[PERF_RECORD_LOST];
+	nr_events = evsel->evlist->stats.nr_events[PERF_RECORD_LOST];
 	if (nr_events != 0) {
 		menu->lost_events = true;
 		if (!current_entry)
@@ -3647,7 +3648,7 @@ static int block_hists_browser__title(struct hist_browser *browser, char *bf,
 {
 	struct hists *hists = evsel__hists(browser->block_evsel);
 	const char *evname = evsel__name(browser->block_evsel);
-	unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+	unsigned long nr_samples = hists->stats.nr_samples;
 	int ret;
 
 	ret = scnprintf(bf, size, "# Samples: %lu", nr_samples);
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 2ab2af4d4849..f36270485168 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -897,10 +897,12 @@ out:
 	return ret;
 }
 
-size_t events_stats__fprintf(struct events_stats *stats, FILE *fp)
+size_t events_stats__fprintf(struct events_stats *stats, FILE *fp,
+			     bool skip_empty)
 {
 	int i;
 	size_t ret = 0;
+	u32 total = stats->nr_events[0];
 
 	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
 		const char *name;
@@ -908,8 +910,17 @@ size_t events_stats__fprintf(struct events_stats *stats, FILE *fp)
 		name = perf_event__name(i);
 		if (!strcmp(name, "UNKNOWN"))
 			continue;
+		if (skip_empty && !stats->nr_events[i])
+			continue;
 
-		ret += fprintf(fp, "%16s events: %10d\n", name, stats->nr_events[i]);
+		if (i && total) {
+			ret += fprintf(fp, "%16s events: %10d  (%4.1f%%)\n",
+				       name, stats->nr_events[i],
+				       100.0 * stats->nr_events[i] / total);
+		} else {
+			ret += fprintf(fp, "%16s events: %10d\n",
+				       name, stats->nr_events[i]);
+		}
 	}
 
 	return ret;
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index e3e12f9d4733..8c0d9f368ebc 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -10,6 +10,7 @@ perf-y += db-export.o
 perf-y += env.o
 perf-y += event.o
 perf-y += evlist.o
+perf-y += evlist-hybrid.o
 perf-y += sideband_evlist.o
 perf-y += evsel.o
 perf-y += evsel_fprintf.o
@@ -23,6 +24,7 @@ perf-y += llvm-utils.o
 perf-y += mmap.o
 perf-y += memswap.o
 perf-y += parse-events.o
+perf-y += parse-events-hybrid.o
 perf-y += perf_regs.o
 perf-y += path.o
 perf-y += print_binary.o
@@ -69,6 +71,7 @@ perf-y += parse-events-bison.o
 perf-y += pmu.o
 perf-y += pmu-flex.o
 perf-y += pmu-bison.o
+perf-y += pmu-hybrid.o
 perf-y += trace-event-read.o
 perf-y += trace-event-info.o
 perf-y += trace-event-scripting.o
@@ -102,6 +105,7 @@ perf-y += rwsem.o
 perf-y += thread-stack.o
 perf-y += spark.o
 perf-y += topdown.o
+perf-y += iostat.o
 perf-y += stream.o
 perf-$(CONFIG_AUXTRACE) += auxtrace.o
 perf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
@@ -164,6 +168,7 @@ perf-$(CONFIG_LIBUNWIND_X86)      += libunwind/x86_32.o
 perf-$(CONFIG_LIBUNWIND_AARCH64)  += libunwind/arm64.o
 
 perf-$(CONFIG_LIBBABELTRACE) += data-convert-bt.o
+perf-y += data-convert-json.o
 
 perf-y += scripting-engines/
 
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index e60841b86d27..abe1499a9164 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1161,6 +1161,7 @@ struct annotate_args {
 	s64			  offset;
 	char			  *line;
 	int			  line_nr;
+	char			  *fileloc;
 };
 
 static void annotation_line__init(struct annotation_line *al,
@@ -1170,6 +1171,7 @@ static void annotation_line__init(struct annotation_line *al,
 	al->offset = args->offset;
 	al->line = strdup(args->line);
 	al->line_nr = args->line_nr;
+	al->fileloc = args->fileloc;
 	al->data_nr = nr;
 }
 
@@ -1366,7 +1368,6 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
 {
 	struct disasm_line *dl = container_of(al, struct disasm_line, al);
 	static const char *prev_line;
-	static const char *prev_color;
 
 	if (al->offset != -1) {
 		double max_percent = 0.0;
@@ -1405,20 +1406,6 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
 
 		color = get_percent_color(max_percent);
 
-		/*
-		 * Also color the filename and line if needed, with
-		 * the same color than the percentage. Don't print it
-		 * twice for close colored addr with the same filename:line
-		 */
-		if (al->path) {
-			if (!prev_line || strcmp(prev_line, al->path)
-				       || color != prev_color) {
-				color_fprintf(stdout, color, " %s", al->path);
-				prev_line = al->path;
-				prev_color = color;
-			}
-		}
-
 		for (i = 0; i < nr_percent; i++) {
 			struct annotation_data *data = &al->data[i];
 			double percent;
@@ -1439,6 +1426,19 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
 		printf(" : ");
 
 		disasm_line__print(dl, start, addr_fmt_width);
+
+		/*
+		 * Also color the filename and line if needed, with
+		 * the same color than the percentage. Don't print it
+		 * twice for close colored addr with the same filename:line
+		 */
+		if (al->path) {
+			if (!prev_line || strcmp(prev_line, al->path)) {
+				color_fprintf(stdout, color, " // %s", al->path);
+				prev_line = al->path;
+			}
+		}
+
 		printf("\n");
 	} else if (max_lines && printed >= max_lines)
 		return 1;
@@ -1454,7 +1454,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
 		if (!*al->line)
 			printf(" %*s:\n", width, " ");
 		else
-			printf(" %*s:     %*s %s\n", width, " ", addr_fmt_width, " ", al->line);
+			printf(" %*s: %-*d %s\n", width, " ", addr_fmt_width, al->line_nr, al->line);
 	}
 
 	return 0;
@@ -1482,7 +1482,7 @@ annotation_line__print(struct annotation_line *al, struct symbol *sym, u64 start
  */
 static int symbol__parse_objdump_line(struct symbol *sym,
 				      struct annotate_args *args,
-				      char *parsed_line, int *line_nr)
+				      char *parsed_line, int *line_nr, char **fileloc)
 {
 	struct map *map = args->ms.map;
 	struct annotation *notes = symbol__annotation(sym);
@@ -1494,6 +1494,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	/* /filename:linenr ? Save line number and ignore. */
 	if (regexec(&file_lineno, parsed_line, 2, match, 0) == 0) {
 		*line_nr = atoi(parsed_line + match[1].rm_so);
+		*fileloc = strdup(parsed_line);
 		return 0;
 	}
 
@@ -1513,6 +1514,7 @@ static int symbol__parse_objdump_line(struct symbol *sym,
 	args->offset  = offset;
 	args->line    = parsed_line;
 	args->line_nr = *line_nr;
+	args->fileloc = *fileloc;
 	args->ms.sym  = sym;
 
 	dl = disasm_line__new(args);
@@ -1807,6 +1809,7 @@ static int symbol__disassemble_bpf(struct symbol *sym,
 			args->offset = -1;
 			args->line = strdup(srcline);
 			args->line_nr = 0;
+			args->fileloc = NULL;
 			args->ms.sym  = sym;
 			dl = disasm_line__new(args);
 			if (dl) {
@@ -1818,6 +1821,7 @@ static int symbol__disassemble_bpf(struct symbol *sym,
 		args->offset = pc;
 		args->line = buf + prev_buf_size;
 		args->line_nr = 0;
+		args->fileloc = NULL;
 		args->ms.sym  = sym;
 		dl = disasm_line__new(args);
 		if (dl)
@@ -1852,6 +1856,7 @@ symbol__disassemble_bpf_image(struct symbol *sym,
 	args->offset = -1;
 	args->line = strdup("to be implemented");
 	args->line_nr = 0;
+	args->fileloc = NULL;
 	dl = disasm_line__new(args);
 	if (dl)
 		annotation_line__add(&dl->al, &notes->src->source);
@@ -1933,6 +1938,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 	bool delete_extract = false;
 	bool decomp = false;
 	int lineno = 0;
+	char *fileloc = NULL;
 	int nline;
 	char *line;
 	size_t line_len;
@@ -2060,7 +2066,7 @@ static int symbol__disassemble(struct symbol *sym, struct annotate_args *args)
 		 * See disasm_line__new() and struct disasm_line::line_nr.
 		 */
 		if (symbol__parse_objdump_line(sym, args, expanded_line,
-					       &lineno) < 0)
+					       &lineno, &fileloc) < 0)
 			break;
 		nline++;
 	}
@@ -3144,6 +3150,10 @@ static int annotation__config(const char *var, const char *value, void *data)
 		opt->use_offset = perf_config_bool("use_offset", value);
 	} else if (!strcmp(var, "annotate.disassembler_style")) {
 		opt->disassembler_style = value;
+	} else if (!strcmp(var, "annotate.demangle")) {
+		symbol_conf.demangle = perf_config_bool("demangle", value);
+	} else if (!strcmp(var, "annotate.demangle_kernel")) {
+		symbol_conf.demangle_kernel = perf_config_bool("demangle_kernel", value);
 	} else {
 		pr_debug("%s variable unknown, ignoring...", var);
 	}
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 096cdaf21b01..3757416bcf46 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -84,6 +84,7 @@ struct annotation_options {
 	     print_lines,
 	     full_path,
 	     show_linenr,
+	     show_fileloc,
 	     show_nr_jumps,
 	     show_minmax_cycle,
 	     show_asm_raw,
@@ -136,6 +137,7 @@ struct annotation_line {
 	s64			 offset;
 	char			*line;
 	int			 line_nr;
+	char			*fileloc;
 	int			 jump_sources;
 	float			 ipc;
 	u64			 cycles;
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 9087f1bffd3d..fbb3c4057c30 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -671,7 +671,7 @@ int bpf__probe(struct bpf_object *obj)
 		 * After probing, let's consider prologue, which
 		 * adds program fetcher to BPF programs.
 		 *
-		 * hook_load_preprocessorr() hooks pre-processor
+		 * hook_load_preprocessor() hooks pre-processor
 		 * to bpf_program, let it generate prologue
 		 * dynamically during loading.
 		 */
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index 04f89120b323..ddb52f748c8e 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <limits.h>
 #include <unistd.h>
+#include <sys/file.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <linux/err.h>
@@ -12,14 +13,24 @@
 #include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
+#include <api/fs/fs.h>
+#include <perf/bpf_perf.h>
 
 #include "bpf_counter.h"
 #include "counts.h"
 #include "debug.h"
 #include "evsel.h"
+#include "evlist.h"
 #include "target.h"
+#include "cpumap.h"
+#include "thread_map.h"
 
 #include "bpf_skel/bpf_prog_profiler.skel.h"
+#include "bpf_skel/bperf_u.h"
+#include "bpf_skel/bperf_leader.skel.h"
+#include "bpf_skel/bperf_follower.skel.h"
+
+#define ATTR_MAP_SIZE 16
 
 static inline void *u64_to_ptr(__u64 ptr)
 {
@@ -204,6 +215,17 @@ static int bpf_program_profiler__enable(struct evsel *evsel)
 	return 0;
 }
 
+static int bpf_program_profiler__disable(struct evsel *evsel)
+{
+	struct bpf_counter *counter;
+
+	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
+		assert(counter->skel != NULL);
+		bpf_prog_profiler_bpf__detach(counter->skel);
+	}
+	return 0;
+}
+
 static int bpf_program_profiler__read(struct evsel *evsel)
 {
 	// perf_cpu_map uses /sys/devices/system/cpu/online
@@ -269,22 +291,527 @@ static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu,
 struct bpf_counter_ops bpf_program_profiler_ops = {
 	.load       = bpf_program_profiler__load,
 	.enable	    = bpf_program_profiler__enable,
+	.disable    = bpf_program_profiler__disable,
 	.read       = bpf_program_profiler__read,
 	.destroy    = bpf_program_profiler__destroy,
 	.install_pe = bpf_program_profiler__install_pe,
 };
 
+static __u32 bpf_link_get_id(int fd)
+{
+	struct bpf_link_info link_info = {0};
+	__u32 link_info_len = sizeof(link_info);
+
+	bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+	return link_info.id;
+}
+
+static __u32 bpf_link_get_prog_id(int fd)
+{
+	struct bpf_link_info link_info = {0};
+	__u32 link_info_len = sizeof(link_info);
+
+	bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len);
+	return link_info.prog_id;
+}
+
+static __u32 bpf_map_get_id(int fd)
+{
+	struct bpf_map_info map_info = {0};
+	__u32 map_info_len = sizeof(map_info);
+
+	bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len);
+	return map_info.id;
+}
+
+static bool bperf_attr_map_compatible(int attr_map_fd)
+{
+	struct bpf_map_info map_info = {0};
+	__u32 map_info_len = sizeof(map_info);
+	int err;
+
+	err = bpf_obj_get_info_by_fd(attr_map_fd, &map_info, &map_info_len);
+
+	if (err)
+		return false;
+	return (map_info.key_size == sizeof(struct perf_event_attr)) &&
+		(map_info.value_size == sizeof(struct perf_event_attr_map_entry));
+}
+
+static int bperf_lock_attr_map(struct target *target)
+{
+	char path[PATH_MAX];
+	int map_fd, err;
+
+	if (target->attr_map) {
+		scnprintf(path, PATH_MAX, "%s", target->attr_map);
+	} else {
+		scnprintf(path, PATH_MAX, "%s/fs/bpf/%s", sysfs__mountpoint(),
+			  BPF_PERF_DEFAULT_ATTR_MAP_PATH);
+	}
+
+	if (access(path, F_OK)) {
+		map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
+					sizeof(struct perf_event_attr),
+					sizeof(struct perf_event_attr_map_entry),
+					ATTR_MAP_SIZE, 0);
+		if (map_fd < 0)
+			return -1;
+
+		err = bpf_obj_pin(map_fd, path);
+		if (err) {
+			/* someone pinned the map in parallel? */
+			close(map_fd);
+			map_fd = bpf_obj_get(path);
+			if (map_fd < 0)
+				return -1;
+		}
+	} else {
+		map_fd = bpf_obj_get(path);
+		if (map_fd < 0)
+			return -1;
+	}
+
+	if (!bperf_attr_map_compatible(map_fd)) {
+		close(map_fd);
+		return -1;
+
+	}
+	err = flock(map_fd, LOCK_EX);
+	if (err) {
+		close(map_fd);
+		return -1;
+	}
+	return map_fd;
+}
+
+/* trigger the leader program on a cpu */
+static int bperf_trigger_reading(int prog_fd, int cpu)
+{
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+			    .ctx_in = NULL,
+			    .ctx_size_in = 0,
+			    .flags = BPF_F_TEST_RUN_ON_CPU,
+			    .cpu = cpu,
+			    .retval = 0,
+		);
+
+	return bpf_prog_test_run_opts(prog_fd, &opts);
+}
+
+static int bperf_check_target(struct evsel *evsel,
+			      struct target *target,
+			      enum bperf_filter_type *filter_type,
+			      __u32 *filter_entry_cnt)
+{
+	if (evsel->leader->core.nr_members > 1) {
+		pr_err("bpf managed perf events do not yet support groups.\n");
+		return -1;
+	}
+
+	/* determine filter type based on target */
+	if (target->system_wide) {
+		*filter_type = BPERF_FILTER_GLOBAL;
+		*filter_entry_cnt = 1;
+	} else if (target->cpu_list) {
+		*filter_type = BPERF_FILTER_CPU;
+		*filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
+	} else if (target->tid) {
+		*filter_type = BPERF_FILTER_PID;
+		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+	} else if (target->pid || evsel->evlist->workload.pid != -1) {
+		*filter_type = BPERF_FILTER_TGID;
+		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
+	} else {
+		pr_err("bpf managed perf events do not yet support these targets.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static	struct perf_cpu_map *all_cpu_map;
+
+static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
+				       struct perf_event_attr_map_entry *entry)
+{
+	struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
+	int link_fd, diff_map_fd, err;
+	struct bpf_link *link = NULL;
+
+	if (!skel) {
+		pr_err("Failed to open leader skeleton\n");
+		return -1;
+	}
+
+	bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus());
+	err = bperf_leader_bpf__load(skel);
+	if (err) {
+		pr_err("Failed to load leader skeleton\n");
+		goto out;
+	}
+
+	err = -1;
+	link = bpf_program__attach(skel->progs.on_switch);
+	if (!link) {
+		pr_err("Failed to attach leader program\n");
+		goto out;
+	}
+
+	link_fd = bpf_link__fd(link);
+	diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
+	entry->link_id = bpf_link_get_id(link_fd);
+	entry->diff_map_id = bpf_map_get_id(diff_map_fd);
+	err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
+	assert(err == 0);
+
+	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
+	assert(evsel->bperf_leader_link_fd >= 0);
+
+	/*
+	 * save leader_skel for install_pe, which is called within
+	 * following evsel__open_per_cpu call
+	 */
+	evsel->leader_skel = skel;
+	evsel__open_per_cpu(evsel, all_cpu_map, -1);
+
+out:
+	bperf_leader_bpf__destroy(skel);
+	bpf_link__destroy(link);
+	return err;
+}
+
+static int bperf__load(struct evsel *evsel, struct target *target)
+{
+	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
+	int attr_map_fd, diff_map_fd = -1, err;
+	enum bperf_filter_type filter_type;
+	__u32 filter_entry_cnt, i;
+
+	if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
+		return -1;
+
+	if (!all_cpu_map) {
+		all_cpu_map = perf_cpu_map__new(NULL);
+		if (!all_cpu_map)
+			return -1;
+	}
+
+	evsel->bperf_leader_prog_fd = -1;
+	evsel->bperf_leader_link_fd = -1;
+
+	/*
+	 * Step 1: hold a fd on the leader program and the bpf_link, if
+	 * the program is not already gone, reload the program.
+	 * Use flock() to ensure exclusive access to the perf_event_attr
+	 * map.
+	 */
+	attr_map_fd = bperf_lock_attr_map(target);
+	if (attr_map_fd < 0) {
+		pr_err("Failed to lock perf_event_attr map\n");
+		return -1;
+	}
+
+	err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry);
+	if (err) {
+		err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY);
+		if (err)
+			goto out;
+	}
+
+	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
+	if (evsel->bperf_leader_link_fd < 0 &&
+	    bperf_reload_leader_program(evsel, attr_map_fd, &entry))
+		goto out;
+
+	/*
+	 * The bpf_link holds reference to the leader program, and the
+	 * leader program holds reference to the maps. Therefore, if
+	 * link_id is valid, diff_map_id should also be valid.
+	 */
+	evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id(
+		bpf_link_get_prog_id(evsel->bperf_leader_link_fd));
+	assert(evsel->bperf_leader_prog_fd >= 0);
+
+	diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id);
+	assert(diff_map_fd >= 0);
+
+	/*
+	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
+	 * whether the kernel support it
+	 */
+	err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0);
+	if (err) {
+		pr_err("The kernel does not support test_run for raw_tp BPF programs.\n"
+		       "Therefore, --use-bpf might show inaccurate readings\n");
+		goto out;
+	}
+
+	/* Step 2: load the follower skeleton */
+	evsel->follower_skel = bperf_follower_bpf__open();
+	if (!evsel->follower_skel) {
+		pr_err("Failed to open follower skeleton\n");
+		goto out;
+	}
+
+	/* attach fexit program to the leader program */
+	bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX,
+				       evsel->bperf_leader_prog_fd, "on_switch");
+
+	/* connect to leader diff_reading map */
+	bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd);
+
+	/* set up reading map */
+	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
+				 filter_entry_cnt);
+	/* set up follower filter based on target */
+	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
+				 filter_entry_cnt);
+	err = bperf_follower_bpf__load(evsel->follower_skel);
+	if (err) {
+		pr_err("Failed to load follower skeleton\n");
+		bperf_follower_bpf__destroy(evsel->follower_skel);
+		evsel->follower_skel = NULL;
+		goto out;
+	}
+
+	for (i = 0; i < filter_entry_cnt; i++) {
+		int filter_map_fd;
+		__u32 key;
+
+		if (filter_type == BPERF_FILTER_PID ||
+		    filter_type == BPERF_FILTER_TGID)
+			key = evsel->core.threads->map[i].pid;
+		else if (filter_type == BPERF_FILTER_CPU)
+			key = evsel->core.cpus->map[i];
+		else
+			break;
+
+		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
+		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
+	}
+
+	evsel->follower_skel->bss->type = filter_type;
+
+	err = bperf_follower_bpf__attach(evsel->follower_skel);
+
+out:
+	if (err && evsel->bperf_leader_link_fd >= 0)
+		close(evsel->bperf_leader_link_fd);
+	if (err && evsel->bperf_leader_prog_fd >= 0)
+		close(evsel->bperf_leader_prog_fd);
+	if (diff_map_fd >= 0)
+		close(diff_map_fd);
+
+	flock(attr_map_fd, LOCK_UN);
+	close(attr_map_fd);
+
+	return err;
+}
+
+static int bperf__install_pe(struct evsel *evsel, int cpu, int fd)
+{
+	struct bperf_leader_bpf *skel = evsel->leader_skel;
+
+	return bpf_map_update_elem(bpf_map__fd(skel->maps.events),
+				   &cpu, &fd, BPF_ANY);
+}
+
+/*
+ * trigger the leader prog on each cpu, so the accum_reading map could get
+ * the latest readings.
+ */
+static int bperf_sync_counters(struct evsel *evsel)
+{
+	int num_cpu, i, cpu;
+
+	num_cpu = all_cpu_map->nr;
+	for (i = 0; i < num_cpu; i++) {
+		cpu = all_cpu_map->map[i];
+		bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu);
+	}
+	return 0;
+}
+
+static int bperf__enable(struct evsel *evsel)
+{
+	evsel->follower_skel->bss->enabled = 1;
+	return 0;
+}
+
+static int bperf__disable(struct evsel *evsel)
+{
+	evsel->follower_skel->bss->enabled = 0;
+	return 0;
+}
+
+static int bperf__read(struct evsel *evsel)
+{
+	struct bperf_follower_bpf *skel = evsel->follower_skel;
+	__u32 num_cpu_bpf = cpu__max_cpu();
+	struct bpf_perf_event_value values[num_cpu_bpf];
+	int reading_map_fd, err = 0;
+	__u32 i, j, num_cpu;
+
+	bperf_sync_counters(evsel);
+	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
+
+	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+		__u32 cpu;
+
+		err = bpf_map_lookup_elem(reading_map_fd, &i, values);
+		if (err)
+			goto out;
+		switch (evsel->follower_skel->bss->type) {
+		case BPERF_FILTER_GLOBAL:
+			assert(i == 0);
+
+			num_cpu = all_cpu_map->nr;
+			for (j = 0; j < num_cpu; j++) {
+				cpu = all_cpu_map->map[j];
+				perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter;
+				perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled;
+				perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running;
+			}
+			break;
+		case BPERF_FILTER_CPU:
+			cpu = evsel->core.cpus->map[i];
+			perf_counts(evsel->counts, i, 0)->val = values[cpu].counter;
+			perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled;
+			perf_counts(evsel->counts, i, 0)->run = values[cpu].running;
+			break;
+		case BPERF_FILTER_PID:
+		case BPERF_FILTER_TGID:
+			perf_counts(evsel->counts, 0, i)->val = 0;
+			perf_counts(evsel->counts, 0, i)->ena = 0;
+			perf_counts(evsel->counts, 0, i)->run = 0;
+
+			for (cpu = 0; cpu < num_cpu_bpf; cpu++) {
+				perf_counts(evsel->counts, 0, i)->val += values[cpu].counter;
+				perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled;
+				perf_counts(evsel->counts, 0, i)->run += values[cpu].running;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+out:
+	return err;
+}
+
+static int bperf__destroy(struct evsel *evsel)
+{
+	bperf_follower_bpf__destroy(evsel->follower_skel);
+	close(evsel->bperf_leader_prog_fd);
+	close(evsel->bperf_leader_link_fd);
+	return 0;
+}
+
+/*
+ * bperf: share hardware PMCs with BPF
+ *
+ * perf uses performance monitoring counters (PMC) to monitor system
+ * performance. The PMCs are limited hardware resources. For example,
+ * Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
+ *
+ * Modern data center systems use these PMCs in many different ways:
+ * system level monitoring, (maybe nested) container level monitoring, per
+ * process monitoring, profiling (in sample mode), etc. In some cases,
+ * there are more active perf_events than available hardware PMCs. To allow
+ * all perf_events to have a chance to run, it is necessary to do expensive
+ * time multiplexing of events.
+ *
+ * On the other hand, many monitoring tools count the common metrics
+ * (cycles, instructions). It is a waste to have multiple tools create
+ * multiple perf_events of "cycles" and occupy multiple PMCs.
+ *
+ * bperf tries to reduce such wastes by allowing multiple perf_events of
+ * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
+ * of having each perf-stat session to read its own perf_events, bperf uses
+ * BPF programs to read the perf_events and aggregate readings to BPF maps.
+ * Then, the perf-stat session(s) reads the values from these BPF maps.
+ *
+ *                                ||
+ *       shared progs and maps <- || -> per session progs and maps
+ *                                ||
+ *   ---------------              ||
+ *   | perf_events |              ||
+ *   ---------------       fexit  ||      -----------------
+ *          |             --------||----> | follower prog |
+ *       --------------- /        || ---  -----------------
+ * cs -> | leader prog |/         ||/        |         |
+ *   --> ---------------         /||  --------------  ------------------
+ *  /       |         |         / ||  | filter map |  | accum_readings |
+ * /  ------------  ------------  ||  --------------  ------------------
+ * |  | prev map |  | diff map |  ||                        |
+ * |  ------------  ------------  ||                        |
+ *  \                             ||                        |
+ * = \ ==================================================== | ============
+ *    \                                                    /   user space
+ *     \                                                  /
+ *      \                                                /
+ *    BPF_PROG_TEST_RUN                    BPF_MAP_LOOKUP_ELEM
+ *        \                                            /
+ *         \                                          /
+ *          \------  perf-stat ----------------------/
+ *
+ * The figure above shows the architecture of bperf. Note that the figure
+ * is divided into 3 regions: shared progs and maps (top left), per session
+ * progs and maps (top right), and user space (bottom).
+ *
+ * The leader prog is triggered on each context switch (cs). The leader
+ * prog reads perf_events and stores the difference (current_reading -
+ * previous_reading) to the diff map. For the same metric, e.g. "cycles",
+ * multiple perf-stat sessions share the same leader prog.
+ *
+ * Each perf-stat session creates a follower prog as fexit program to the
+ * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
+ * follower progs to the same leader prog. The follower prog checks current
+ * task and processor ID to decide whether to add the value from the diff
+ * map to its accumulated reading map (accum_readings).
+ *
+ * Finally, perf-stat user space reads the value from accum_reading map.
+ *
+ * Besides context switch, it is also necessary to trigger the leader prog
+ * before perf-stat reads the value. Otherwise, the accum_reading map may
+ * not have the latest reading from the perf_events. This is achieved by
+ * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) to each CPU.
+ *
+ * Comment before the definition of struct perf_event_attr_map_entry
+ * describes how different sessions of perf-stat share information about
+ * the leader prog.
+ */
+
+struct bpf_counter_ops bperf_ops = {
+	.load       = bperf__load,
+	.enable     = bperf__enable,
+	.disable    = bperf__disable,
+	.read       = bperf__read,
+	.install_pe = bperf__install_pe,
+	.destroy    = bperf__destroy,
+};
+
+static inline bool bpf_counter_skip(struct evsel *evsel)
+{
+	return list_empty(&evsel->bpf_counter_list) &&
+		evsel->follower_skel == NULL;
+}
+
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return 0;
 	return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd);
 }
 
 int bpf_counter__load(struct evsel *evsel, struct target *target)
 {
-	if (target__has_bpf(target))
+	if (target->bpf_str)
 		evsel->bpf_counter_ops = &bpf_program_profiler_ops;
+	else if (target->use_bpf || evsel->bpf_counter ||
+		 evsel__match_bpf_counter_events(evsel->name))
+		evsel->bpf_counter_ops = &bperf_ops;
 
 	if (evsel->bpf_counter_ops)
 		return evsel->bpf_counter_ops->load(evsel, target);
@@ -293,21 +820,28 @@ int bpf_counter__load(struct evsel *evsel, struct target *target)
 
 int bpf_counter__enable(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return 0;
 	return evsel->bpf_counter_ops->enable(evsel);
 }
 
+int bpf_counter__disable(struct evsel *evsel)
+{
+	if (bpf_counter_skip(evsel))
+		return 0;
+	return evsel->bpf_counter_ops->disable(evsel);
+}
+
 int bpf_counter__read(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return -EAGAIN;
 	return evsel->bpf_counter_ops->read(evsel);
 }
 
 void bpf_counter__destroy(struct evsel *evsel)
 {
-	if (list_empty(&evsel->bpf_counter_list))
+	if (bpf_counter_skip(evsel))
 		return;
 	evsel->bpf_counter_ops->destroy(evsel);
 	evsel->bpf_counter_ops = NULL;
diff --git a/tools/perf/util/bpf_counter.h b/tools/perf/util/bpf_counter.h
index 2eca210e5dc1..d6d907c3dcf9 100644
--- a/tools/perf/util/bpf_counter.h
+++ b/tools/perf/util/bpf_counter.h
@@ -18,6 +18,7 @@ typedef int (*bpf_counter_evsel_install_pe_op)(struct evsel *evsel,
 struct bpf_counter_ops {
 	bpf_counter_evsel_target_op load;
 	bpf_counter_evsel_op enable;
+	bpf_counter_evsel_op disable;
 	bpf_counter_evsel_op read;
 	bpf_counter_evsel_op destroy;
 	bpf_counter_evsel_install_pe_op install_pe;
@@ -32,13 +33,14 @@ struct bpf_counter {
 
 int bpf_counter__load(struct evsel *evsel, struct target *target);
 int bpf_counter__enable(struct evsel *evsel);
+int bpf_counter__disable(struct evsel *evsel);
 int bpf_counter__read(struct evsel *evsel);
 void bpf_counter__destroy(struct evsel *evsel);
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd);
 
 #else /* HAVE_BPF_SKEL */
 
-#include<linux/err.h>
+#include <linux/err.h>
 
 static inline int bpf_counter__load(struct evsel *evsel __maybe_unused,
 				    struct target *target __maybe_unused)
@@ -51,6 +53,11 @@ static inline int bpf_counter__enable(struct evsel *evsel __maybe_unused)
 	return 0;
 }
 
+static inline int bpf_counter__disable(struct evsel *evsel __maybe_unused)
+{
+	return 0;
+}
+
 static inline int bpf_counter__read(struct evsel *evsel __maybe_unused)
 {
 	return -EAGAIN;
diff --git a/tools/perf/util/bpf_skel/bperf.h b/tools/perf/util/bpf_skel/bperf.h
new file mode 100644
index 000000000000..186a5551ddb9
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_H
+#define __BPERF_STAT_H
+
+typedef struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct bpf_perf_event_value));
+	__uint(max_entries, 1);
+} reading_map;
+
+#endif /* __BPERF_STAT_H */
diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
new file mode 100644
index 000000000000..b8fa3cb2da23
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+#include "bperf_u.h"
+
+reading_map diff_readings SEC(".maps");
+reading_map accum_readings SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u32));
+} filter SEC(".maps");
+
+enum bperf_filter_type type = 0;
+int enabled = 0;
+
+SEC("fexit/XXX")
+int BPF_PROG(fexit_XXX)
+{
+	struct bpf_perf_event_value *diff_val, *accum_val;
+	__u32 filter_key, zero = 0;
+	__u32 *accum_key;
+
+	if (!enabled)
+		return 0;
+
+	switch (type) {
+	case BPERF_FILTER_GLOBAL:
+		accum_key = &zero;
+		goto do_add;
+	case BPERF_FILTER_CPU:
+		filter_key = bpf_get_smp_processor_id();
+		break;
+	case BPERF_FILTER_PID:
+		filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
+		break;
+	case BPERF_FILTER_TGID:
+		filter_key = bpf_get_current_pid_tgid() >> 32;
+		break;
+	default:
+		return 0;
+	}
+
+	accum_key = bpf_map_lookup_elem(&filter, &filter_key);
+	if (!accum_key)
+		return 0;
+
+do_add:
+	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+	if (!diff_val)
+		return 0;
+
+	accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+	if (!accum_val)
+		return 0;
+
+	accum_val->counter += diff_val->counter;
+	accum_val->enabled += diff_val->enabled;
+	accum_val->running += diff_val->running;
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
new file mode 100644
index 000000000000..4f70d1459e86
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(int));
+	__uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} events SEC(".maps");
+
+reading_map prev_readings SEC(".maps");
+reading_map diff_readings SEC(".maps");
+
+SEC("raw_tp/sched_switch")
+int BPF_PROG(on_switch)
+{
+	struct bpf_perf_event_value val, *prev_val, *diff_val;
+	__u32 key = bpf_get_smp_processor_id();
+	__u32 zero = 0;
+	long err;
+
+	prev_val = bpf_map_lookup_elem(&prev_readings, &zero);
+	if (!prev_val)
+		return 0;
+
+	diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+	if (!diff_val)
+		return 0;
+
+	err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
+	if (err)
+		return 0;
+
+	diff_val->counter = val.counter - prev_val->counter;
+	diff_val->enabled = val.enabled - prev_val->enabled;
+	diff_val->running = val.running - prev_val->running;
+	*prev_val = val;
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
new file mode 100644
index 000000000000..1ce0c2c905c1
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_u.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_U_H
+#define __BPERF_STAT_U_H
+
+enum bperf_filter_type {
+	BPERF_FILTER_GLOBAL = 1,
+	BPERF_FILTER_CPU,
+	BPERF_FILTER_PID,
+	BPERF_FILTER_TGID,
+};
+
+#endif /* __BPERF_STAT_U_H */
diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
index c7cec92d0236..ab12b4c4ece2 100644
--- a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
+++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
@@ -52,7 +52,7 @@ int BPF_PROG(fentry_XXX)
 static inline void
 fexit_update_maps(struct bpf_perf_event_value *after)
 {
-	struct bpf_perf_event_value *before, diff, *accum;
+	struct bpf_perf_event_value *before, diff;
 	__u32 zero = 0;
 
 	before = bpf_map_lookup_elem(&fentry_readings, &zero);
@@ -78,7 +78,6 @@ int BPF_PROG(fexit_XXX)
 {
 	struct bpf_perf_event_value reading;
 	__u32 cpu = bpf_get_smp_processor_id();
-	__u32 one = 1, zero = 0;
 	int err;
 
 	/* read all events before updating the maps, to reduce error */
diff --git a/tools/perf/util/call-path.h b/tools/perf/util/call-path.h
index 6b3229106f16..5875cfc8106e 100644
--- a/tools/perf/util/call-path.h
+++ b/tools/perf/util/call-path.h
@@ -23,7 +23,7 @@
  * @children: tree of call paths of functions called
  *
  * In combination with the call_return structure, the call_path structure
- * defines a context-sensitve call-graph.
+ * defines a context-sensitive call-graph.
  */
 struct call_path {
 	struct call_path *parent;
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 1b60985690bb..8e2777133bd9 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -877,7 +877,7 @@ append_chain_children(struct callchain_node *root,
 	if (!node)
 		return -1;
 
-	/* lookup in childrens */
+	/* lookup in children */
 	while (*p) {
 		enum match_result ret;
 
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 6984c77068a3..63d472b336de 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -18,6 +18,7 @@
 #include "util/hist.h"  /* perf_hist_config */
 #include "util/llvm-utils.h"   /* perf_llvm_config */
 #include "util/stat.h"  /* perf_stat__set_big_num */
+#include "util/evsel.h"  /* evsel__hw_names, evsel__use_bpf_counters */
 #include "build-id.h"
 #include "debug.h"
 #include "config.h"
@@ -457,6 +458,12 @@ static int perf_stat_config(const char *var, const char *value)
 	if (!strcmp(var, "stat.big-num"))
 		perf_stat__set_big_num(perf_config_bool(var, value));
 
+	if (!strcmp(var, "stat.no-csv-summary"))
+		perf_stat__set_no_csv_summary(perf_config_bool(var, value));
+
+	if (!strcmp(var, "stat.bpf-counter-events"))
+		evsel__bpf_counter_events = strdup(value);
+
 	/* Add other config variables here. */
 	return 0;
 }
@@ -699,7 +706,7 @@ static int collect_config(const char *var, const char *value,
 	/* perf_config_set can contain both user and system config items.
 	 * So we should know where each value is from.
 	 * The classification would be needed when a particular config file
-	 * is overwrited by setting feature i.e. set_config().
+	 * is overwritten by setting feature i.e. set_config().
 	 */
 	if (strcmp(config_file_name, perf_etc_perfconfig()) == 0) {
 		section->from_system_config = true;
diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 3f4bc4050477..059bcec3f651 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -6,6 +6,7 @@
  * Author: Mathieu Poirier <mathieu.poirier@linaro.org>
  */
 
+#include <linux/coresight-pmu.h>
 #include <linux/err.h>
 #include <linux/list.h>
 #include <linux/zalloc.h>
@@ -316,7 +317,7 @@ cs_etm_decoder__do_hard_timestamp(struct cs_etm_queue *etmq,
 	 * This is the first timestamp we've seen since the beginning of traces
 	 * or a discontinuity.  Since timestamps packets are generated *after*
 	 * range packets have been generated, we need to estimate the time at
-	 * which instructions started by substracting the number of instructions
+	 * which instructions started by subtracting the number of instructions
 	 * executed to the timestamp.
 	 */
 	packet_queue->timestamp = elem->timestamp - packet_queue->instr_count;
@@ -491,13 +492,42 @@ cs_etm_decoder__set_tid(struct cs_etm_queue *etmq,
 			const ocsd_generic_trace_elem *elem,
 			const uint8_t trace_chan_id)
 {
-	pid_t tid;
+	pid_t tid = -1;
+	static u64 pid_fmt;
+	int ret;
 
-	/* Ignore PE_CONTEXT packets that don't have a valid contextID */
-	if (!elem->context.ctxt_id_valid)
+	/*
+	 * As all the ETMs run at the same exception level, the system should
+	 * have the same PID format crossing CPUs.  So cache the PID format
+	 * and reuse it for sequential decoding.
+	 */
+	if (!pid_fmt) {
+		ret = cs_etm__get_pid_fmt(trace_chan_id, &pid_fmt);
+		if (ret)
+			return OCSD_RESP_FATAL_SYS_ERR;
+	}
+
+	/*
+	 * Process the PE_CONTEXT packets if we have a valid contextID or VMID.
+	 * If the kernel is running at EL2, the PID is traced in CONTEXTIDR_EL2
+	 * as VMID, Bit ETM_OPT_CTXTID2 is set in this case.
+	 */
+	switch (pid_fmt) {
+	case BIT(ETM_OPT_CTXTID):
+		if (elem->context.ctxt_id_valid)
+			tid = elem->context.context_id;
+		break;
+	case BIT(ETM_OPT_CTXTID2):
+		if (elem->context.vmid_valid)
+			tid = elem->context.vmid;
+		break;
+	default:
+		break;
+	}
+
+	if (tid == -1)
 		return OCSD_RESP_CONT;
 
-	tid =  elem->context.context_id;
 	if (cs_etm__etmq_set_tid(etmq, tid, trace_chan_id))
 		return OCSD_RESP_FATAL_SYS_ERR;
 
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c
index a2a369e2fbb6..7e63e7dedc33 100644
--- a/tools/perf/util/cs-etm.c
+++ b/tools/perf/util/cs-etm.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/bitops.h>
+#include <linux/coresight-pmu.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -156,11 +157,52 @@ int cs_etm__get_cpu(u8 trace_chan_id, int *cpu)
 	return 0;
 }
 
+/*
+ * The returned PID format is presented by two bits:
+ *
+ *   Bit ETM_OPT_CTXTID: CONTEXTIDR or CONTEXTIDR_EL1 is traced;
+ *   Bit ETM_OPT_CTXTID2: CONTEXTIDR_EL2 is traced.
+ *
+ * It's possible that the two bits ETM_OPT_CTXTID and ETM_OPT_CTXTID2
+ * are enabled at the same time when the session runs on an EL2 kernel.
+ * This means the CONTEXTIDR_EL1 and CONTEXTIDR_EL2 both will be
+ * recorded in the trace data, the tool will selectively use
+ * CONTEXTIDR_EL2 as PID.
+ */
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt)
+{
+	struct int_node *inode;
+	u64 *metadata, val;
+
+	inode = intlist__find(traceid_list, trace_chan_id);
+	if (!inode)
+		return -EINVAL;
+
+	metadata = inode->priv;
+
+	if (metadata[CS_ETM_MAGIC] == __perf_cs_etmv3_magic) {
+		val = metadata[CS_ETM_ETMCR];
+		/* CONTEXTIDR is traced */
+		if (val & BIT(ETM_OPT_CTXTID))
+			*pid_fmt = BIT(ETM_OPT_CTXTID);
+	} else {
+		val = metadata[CS_ETMV4_TRCCONFIGR];
+		/* CONTEXTIDR_EL2 is traced */
+		if (val & (BIT(ETM4_CFG_BIT_VMID) | BIT(ETM4_CFG_BIT_VMID_OPT)))
+			*pid_fmt = BIT(ETM_OPT_CTXTID2);
+		/* CONTEXTIDR_EL1 is traced */
+		else if (val & BIT(ETM4_CFG_BIT_CTXTID))
+			*pid_fmt = BIT(ETM_OPT_CTXTID);
+	}
+
+	return 0;
+}
+
 void cs_etm__etmq_set_traceid_queue_timestamp(struct cs_etm_queue *etmq,
 					      u8 trace_chan_id)
 {
 	/*
-	 * Wnen a timestamp packet is encountered the backend code
+	 * When a timestamp packet is encountered the backend code
 	 * is stopped so that the front end has time to process packets
 	 * that were accumulated in the traceID queue.  Since there can
 	 * be more than one channel per cs_etm_queue, we need to specify
@@ -1655,7 +1697,7 @@ static bool cs_etm__is_svc_instr(struct cs_etm_queue *etmq, u8 trace_chan_id,
 		 * | 1 1 0 1 1 1 1 1 |  imm8  |
 		 * +-----------------+--------+
 		 *
-		 * According to the specifiction, it only defines SVC for T32
+		 * According to the specification, it only defines SVC for T32
 		 * with 16 bits instruction and has no definition for 32bits;
 		 * so below only read 2 bytes as instruction size for T32.
 		 */
@@ -1887,7 +1929,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq,
 
 		/*
 		 * If the previous packet is an exception return packet
-		 * and the return address just follows SVC instuction,
+		 * and the return address just follows SVC instruction,
 		 * it needs to calibrate the previous packet sample flags
 		 * as PERF_IP_FLAG_SYSCALLRET.
 		 */
@@ -1961,7 +2003,7 @@ static int cs_etm__set_sample_flags(struct cs_etm_queue *etmq,
 		 * contain exception type related info so we cannot decide
 		 * the exception type purely based on exception return packet.
 		 * If we record the exception number from exception packet and
-		 * reuse it for excpetion return packet, this is not reliable
+		 * reuse it for exception return packet, this is not reliable
 		 * due the trace can be discontinuity or the interrupt can
 		 * be nested, thus the recorded exception number cannot be
 		 * used for exception return packet for these two cases.
@@ -2435,7 +2477,7 @@ static bool cs_etm__is_timeless_decoding(struct cs_etm_auxtrace *etm)
 }
 
 static const char * const cs_etm_global_header_fmts[] = {
-	[CS_HEADER_VERSION_0]	= "	Header version		       %llx\n",
+	[CS_HEADER_VERSION]	= "	Header version		       %llx\n",
 	[CS_PMU_TYPE_CPUS]	= "	PMU type/num cpus	       %llx\n",
 	[CS_ETM_SNAPSHOT]	= "	Snapshot		       %llx\n",
 };
@@ -2443,6 +2485,7 @@ static const char * const cs_etm_global_header_fmts[] = {
 static const char * const cs_etm_priv_fmts[] = {
 	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
 	[CS_ETM_CPU]		= "	CPU			       %lld\n",
+	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
 	[CS_ETM_ETMCR]		= "	ETMCR			       %llx\n",
 	[CS_ETM_ETMTRACEIDR]	= "	ETMTRACEIDR		       %llx\n",
 	[CS_ETM_ETMCCER]	= "	ETMCCER			       %llx\n",
@@ -2452,6 +2495,7 @@ static const char * const cs_etm_priv_fmts[] = {
 static const char * const cs_etmv4_priv_fmts[] = {
 	[CS_ETM_MAGIC]		= "	Magic number		       %llx\n",
 	[CS_ETM_CPU]		= "	CPU			       %lld\n",
+	[CS_ETM_NR_TRC_PARAMS]	= "	NR_TRC_PARAMS		       %llx\n",
 	[CS_ETMV4_TRCCONFIGR]	= "	TRCCONFIGR		       %llx\n",
 	[CS_ETMV4_TRCTRACEIDR]	= "	TRCTRACEIDR		       %llx\n",
 	[CS_ETMV4_TRCIDR0]	= "	TRCIDR0			       %llx\n",
@@ -2461,26 +2505,167 @@ static const char * const cs_etmv4_priv_fmts[] = {
 	[CS_ETMV4_TRCAUTHSTATUS] = "	TRCAUTHSTATUS		       %llx\n",
 };
 
-static void cs_etm__print_auxtrace_info(__u64 *val, int num)
+static const char * const param_unk_fmt =
+	"	Unknown parameter [%d]	       %llx\n";
+static const char * const magic_unk_fmt =
+	"	Magic number Unknown	       %llx\n";
+
+static int cs_etm__print_cpu_metadata_v0(__u64 *val, int *offset)
 {
-	int i, j, cpu = 0;
+	int i = *offset, j, nr_params = 0, fmt_offset;
+	__u64 magic;
 
-	for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++)
-		fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
+	/* check magic value */
+	magic = val[i + CS_ETM_MAGIC];
+	if ((magic != __perf_cs_etmv3_magic) &&
+	    (magic != __perf_cs_etmv4_magic)) {
+		/* failure - note bad magic value */
+		fprintf(stdout, magic_unk_fmt, magic);
+		return -EINVAL;
+	}
+
+	/* print common header block */
+	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_MAGIC], val[i++]);
+	fprintf(stdout, cs_etm_priv_fmts[CS_ETM_CPU], val[i++]);
+
+	if (magic == __perf_cs_etmv3_magic) {
+		nr_params = CS_ETM_NR_TRC_PARAMS_V0;
+		fmt_offset = CS_ETM_ETMCR;
+		/* after common block, offset format index past NR_PARAMS */
+		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+			fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
+	} else if (magic == __perf_cs_etmv4_magic) {
+		nr_params = CS_ETMV4_NR_TRC_PARAMS_V0;
+		fmt_offset = CS_ETMV4_TRCCONFIGR;
+		/* after common block, offset format index past NR_PARAMS */
+		for (j = fmt_offset; j < nr_params + fmt_offset; j++, i++)
+			fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
+	}
+	*offset = i;
+	return 0;
+}
+
+static int cs_etm__print_cpu_metadata_v1(__u64 *val, int *offset)
+{
+	int i = *offset, j, total_params = 0;
+	__u64 magic;
+
+	magic = val[i + CS_ETM_MAGIC];
+	/* total params to print is NR_PARAMS + common block size for v1 */
+	total_params = val[i + CS_ETM_NR_TRC_PARAMS] + CS_ETM_COMMON_BLK_MAX_V1;
 
-	for (i = CS_HEADER_VERSION_0_MAX; cpu < num; cpu++) {
-		if (val[i] == __perf_cs_etmv3_magic)
-			for (j = 0; j < CS_ETM_PRIV_MAX; j++, i++)
+	if (magic == __perf_cs_etmv3_magic) {
+		for (j = 0; j < total_params; j++, i++) {
+			/* if newer record - could be excess params */
+			if (j >= CS_ETM_PRIV_MAX)
+				fprintf(stdout, param_unk_fmt, j, val[i]);
+			else
 				fprintf(stdout, cs_etm_priv_fmts[j], val[i]);
-		else if (val[i] == __perf_cs_etmv4_magic)
-			for (j = 0; j < CS_ETMV4_PRIV_MAX; j++, i++)
+		}
+	} else if (magic == __perf_cs_etmv4_magic) {
+		for (j = 0; j < total_params; j++, i++) {
+			/* if newer record - could be excess params */
+			if (j >= CS_ETMV4_PRIV_MAX)
+				fprintf(stdout, param_unk_fmt, j, val[i]);
+			else
 				fprintf(stdout, cs_etmv4_priv_fmts[j], val[i]);
-		else
-			/* failure.. return */
+		}
+	} else {
+		/* failure - note bad magic value and error out */
+		fprintf(stdout, magic_unk_fmt, magic);
+		return -EINVAL;
+	}
+	*offset = i;
+	return 0;
+}
+
+static void cs_etm__print_auxtrace_info(__u64 *val, int num)
+{
+	int i, cpu = 0, version, err;
+
+	/* bail out early on bad header version */
+	version = val[0];
+	if (version > CS_HEADER_CURRENT_VERSION) {
+		/* failure.. return */
+		fprintf(stdout, "	Unknown Header Version = %x, ", version);
+		fprintf(stdout, "Version supported <= %x\n", CS_HEADER_CURRENT_VERSION);
+		return;
+	}
+
+	for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
+		fprintf(stdout, cs_etm_global_header_fmts[i], val[i]);
+
+	for (i = CS_HEADER_VERSION_MAX; cpu < num; cpu++) {
+		if (version == 0)
+			err = cs_etm__print_cpu_metadata_v0(val, &i);
+		else if (version == 1)
+			err = cs_etm__print_cpu_metadata_v1(val, &i);
+		if (err)
 			return;
 	}
 }
 
+/*
+ * Read a single cpu parameter block from the auxtrace_info priv block.
+ *
+ * For version 1 there is a per cpu nr_params entry. If we are handling
+ * version 1 file, then there may be less, the same, or more params
+ * indicated by this value than the compile time number we understand.
+ *
+ * For a version 0 info block, there are a fixed number, and we need to
+ * fill out the nr_param value in the metadata we create.
+ */
+static u64 *cs_etm__create_meta_blk(u64 *buff_in, int *buff_in_offset,
+				    int out_blk_size, int nr_params_v0)
+{
+	u64 *metadata = NULL;
+	int hdr_version;
+	int nr_in_params, nr_out_params, nr_cmn_params;
+	int i, k;
+
+	metadata = zalloc(sizeof(*metadata) * out_blk_size);
+	if (!metadata)
+		return NULL;
+
+	/* read block current index & version */
+	i = *buff_in_offset;
+	hdr_version = buff_in[CS_HEADER_VERSION];
+
+	if (!hdr_version) {
+	/* read version 0 info block into a version 1 metadata block  */
+		nr_in_params = nr_params_v0;
+		metadata[CS_ETM_MAGIC] = buff_in[i + CS_ETM_MAGIC];
+		metadata[CS_ETM_CPU] = buff_in[i + CS_ETM_CPU];
+		metadata[CS_ETM_NR_TRC_PARAMS] = nr_in_params;
+		/* remaining block params at offset +1 from source */
+		for (k = CS_ETM_COMMON_BLK_MAX_V1 - 1; k < nr_in_params; k++)
+			metadata[k + 1] = buff_in[i + k];
+		/* version 0 has 2 common params */
+		nr_cmn_params = 2;
+	} else {
+	/* read version 1 info block - input and output nr_params may differ */
+		/* version 1 has 3 common params */
+		nr_cmn_params = 3;
+		nr_in_params = buff_in[i + CS_ETM_NR_TRC_PARAMS];
+
+		/* if input has more params than output - skip excess */
+		nr_out_params = nr_in_params + nr_cmn_params;
+		if (nr_out_params > out_blk_size)
+			nr_out_params = out_blk_size;
+
+		for (k = CS_ETM_MAGIC; k < nr_out_params; k++)
+			metadata[k] = buff_in[i + k];
+
+		/* record the actual nr params we copied */
+		metadata[CS_ETM_NR_TRC_PARAMS] = nr_out_params - nr_cmn_params;
+	}
+
+	/* adjust in offset by number of in params used */
+	i += nr_in_params + nr_cmn_params;
+	*buff_in_offset = i;
+	return metadata;
+}
+
 int cs_etm__process_auxtrace_info(union perf_event *event,
 				  struct perf_session *session)
 {
@@ -2492,11 +2677,12 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	int info_header_size;
 	int total_size = auxtrace_info->header.size;
 	int priv_size = 0;
-	int num_cpu;
-	int err = 0, idx = -1;
-	int i, j, k;
+	int num_cpu, trcidr_idx;
+	int err = 0;
+	int i, j;
 	u64 *ptr, *hdr = NULL;
 	u64 **metadata = NULL;
+	u64 hdr_version;
 
 	/*
 	 * sizeof(auxtrace_info_event::type) +
@@ -2512,16 +2698,21 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	/* First the global part */
 	ptr = (u64 *) auxtrace_info->priv;
 
-	/* Look for version '0' of the header */
-	if (ptr[0] != 0)
+	/* Look for version of the header */
+	hdr_version = ptr[0];
+	if (hdr_version > CS_HEADER_CURRENT_VERSION) {
+		/* print routine will print an error on bad version */
+		if (dump_trace)
+			cs_etm__print_auxtrace_info(auxtrace_info->priv, 0);
 		return -EINVAL;
+	}
 
-	hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_0_MAX);
+	hdr = zalloc(sizeof(*hdr) * CS_HEADER_VERSION_MAX);
 	if (!hdr)
 		return -ENOMEM;
 
 	/* Extract header information - see cs-etm.h for format */
-	for (i = 0; i < CS_HEADER_VERSION_0_MAX; i++)
+	for (i = 0; i < CS_HEADER_VERSION_MAX; i++)
 		hdr[i] = ptr[i];
 	num_cpu = hdr[CS_PMU_TYPE_CPUS] & 0xffffffff;
 	pmu_type = (unsigned int) ((hdr[CS_PMU_TYPE_CPUS] >> 32) &
@@ -2552,35 +2743,31 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	 */
 	for (j = 0; j < num_cpu; j++) {
 		if (ptr[i] == __perf_cs_etmv3_magic) {
-			metadata[j] = zalloc(sizeof(*metadata[j]) *
-					     CS_ETM_PRIV_MAX);
-			if (!metadata[j]) {
-				err = -ENOMEM;
-				goto err_free_metadata;
-			}
-			for (k = 0; k < CS_ETM_PRIV_MAX; k++)
-				metadata[j][k] = ptr[i + k];
+			metadata[j] =
+				cs_etm__create_meta_blk(ptr, &i,
+							CS_ETM_PRIV_MAX,
+							CS_ETM_NR_TRC_PARAMS_V0);
 
 			/* The traceID is our handle */
-			idx = metadata[j][CS_ETM_ETMTRACEIDR];
-			i += CS_ETM_PRIV_MAX;
+			trcidr_idx = CS_ETM_ETMTRACEIDR;
+
 		} else if (ptr[i] == __perf_cs_etmv4_magic) {
-			metadata[j] = zalloc(sizeof(*metadata[j]) *
-					     CS_ETMV4_PRIV_MAX);
-			if (!metadata[j]) {
-				err = -ENOMEM;
-				goto err_free_metadata;
-			}
-			for (k = 0; k < CS_ETMV4_PRIV_MAX; k++)
-				metadata[j][k] = ptr[i + k];
+			metadata[j] =
+				cs_etm__create_meta_blk(ptr, &i,
+							CS_ETMV4_PRIV_MAX,
+							CS_ETMV4_NR_TRC_PARAMS_V0);
 
 			/* The traceID is our handle */
-			idx = metadata[j][CS_ETMV4_TRCTRACEIDR];
-			i += CS_ETMV4_PRIV_MAX;
+			trcidr_idx = CS_ETMV4_TRCTRACEIDR;
+		}
+
+		if (!metadata[j]) {
+			err = -ENOMEM;
+			goto err_free_metadata;
 		}
 
 		/* Get an RB node for this CPU */
-		inode = intlist__findnew(traceid_list, idx);
+		inode = intlist__findnew(traceid_list, metadata[j][trcidr_idx]);
 
 		/* Something went wrong, no need to continue */
 		if (!inode) {
@@ -2601,7 +2788,7 @@ int cs_etm__process_auxtrace_info(union perf_event *event,
 	}
 
 	/*
-	 * Each of CS_HEADER_VERSION_0_MAX, CS_ETM_PRIV_MAX and
+	 * Each of CS_HEADER_VERSION_MAX, CS_ETM_PRIV_MAX and
 	 * CS_ETMV4_PRIV_MAX mark how many double words are in the
 	 * global metadata, and each cpu's metadata respectively.
 	 * The following tests if the correct number of double words was
@@ -2703,6 +2890,12 @@ err_free_traceid_list:
 	intlist__delete(traceid_list);
 err_free_hdr:
 	zfree(&hdr);
-
+	/*
+	 * At this point, as a minimum we have valid header. Dump the rest of
+	 * the info section - the print routines will error out on structural
+	 * issues.
+	 */
+	if (dump_trace)
+		cs_etm__print_auxtrace_info(auxtrace_info->priv, num_cpu);
 	return err;
 }
diff --git a/tools/perf/util/cs-etm.h b/tools/perf/util/cs-etm.h
index 4ad925d6d799..36428918411e 100644
--- a/tools/perf/util/cs-etm.h
+++ b/tools/perf/util/cs-etm.h
@@ -12,28 +12,43 @@
 
 struct perf_session;
 
-/* Versionning header in case things need tro change in the future.  That way
+/*
+ * Versioning header in case things need to change in the future.  That way
  * decoding of old snapshot is still possible.
  */
 enum {
 	/* Starting with 0x0 */
-	CS_HEADER_VERSION_0,
+	CS_HEADER_VERSION,
 	/* PMU->type (32 bit), total # of CPUs (32 bit) */
 	CS_PMU_TYPE_CPUS,
 	CS_ETM_SNAPSHOT,
-	CS_HEADER_VERSION_0_MAX,
+	CS_HEADER_VERSION_MAX,
 };
 
+/*
+ * Update the version for new format.
+ *
+ * New version 1 format adds a param count to the per cpu metadata.
+ * This allows easy adding of new metadata parameters.
+ * Requires that new params always added after current ones.
+ * Also allows client reader to handle file versions that are different by
+ * checking the number of params in the file vs the number expected.
+ */
+#define CS_HEADER_CURRENT_VERSION 1
+
 /* Beginning of header common to both ETMv3 and V4 */
 enum {
 	CS_ETM_MAGIC,
 	CS_ETM_CPU,
+	/* Number of trace config params in following ETM specific block */
+	CS_ETM_NR_TRC_PARAMS,
+	CS_ETM_COMMON_BLK_MAX_V1,
 };
 
 /* ETMv3/PTM metadata */
 enum {
 	/* Dynamic, configurable parameters */
-	CS_ETM_ETMCR = CS_ETM_CPU + 1,
+	CS_ETM_ETMCR = CS_ETM_COMMON_BLK_MAX_V1,
 	CS_ETM_ETMTRACEIDR,
 	/* RO, taken from sysFS */
 	CS_ETM_ETMCCER,
@@ -41,10 +56,13 @@ enum {
 	CS_ETM_PRIV_MAX,
 };
 
+/* define fixed version 0 length - allow new format reader to read old files. */
+#define CS_ETM_NR_TRC_PARAMS_V0 (CS_ETM_ETMIDR - CS_ETM_ETMCR + 1)
+
 /* ETMv4 metadata */
 enum {
 	/* Dynamic, configurable parameters */
-	CS_ETMV4_TRCCONFIGR = CS_ETM_CPU + 1,
+	CS_ETMV4_TRCCONFIGR = CS_ETM_COMMON_BLK_MAX_V1,
 	CS_ETMV4_TRCTRACEIDR,
 	/* RO, taken from sysFS */
 	CS_ETMV4_TRCIDR0,
@@ -55,9 +73,12 @@ enum {
 	CS_ETMV4_PRIV_MAX,
 };
 
+/* define fixed version 0 length - allow new format reader to read old files. */
+#define CS_ETMV4_NR_TRC_PARAMS_V0 (CS_ETMV4_TRCAUTHSTATUS - CS_ETMV4_TRCCONFIGR + 1)
+
 /*
  * ETMv3 exception encoding number:
- * See Embedded Trace Macrocell spcification (ARM IHI 0014Q)
+ * See Embedded Trace Macrocell specification (ARM IHI 0014Q)
  * table 7-12 Encoding of Exception[3:0] for non-ARMv7-M processors.
  */
 enum {
@@ -162,7 +183,7 @@ struct cs_etm_packet_queue {
 
 #define BMVAL(val, lsb, msb)	((val & GENMASK(msb, lsb)) >> lsb)
 
-#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_0_MAX * sizeof(u64))
+#define CS_ETM_HEADER_SIZE (CS_HEADER_VERSION_MAX * sizeof(u64))
 
 #define __perf_cs_etmv3_magic 0x3030303030303030ULL
 #define __perf_cs_etmv4_magic 0x4040404040404040ULL
@@ -173,6 +194,7 @@ struct cs_etm_packet_queue {
 int cs_etm__process_auxtrace_info(union perf_event *event,
 				  struct perf_session *session);
 int cs_etm__get_cpu(u8 trace_chan_id, int *cpu);
+int cs_etm__get_pid_fmt(u8 trace_chan_id, u64 *pid_fmt);
 int cs_etm__etmq_set_tid(struct cs_etm_queue *etmq,
 			 pid_t tid, u8 trace_chan_id);
 bool cs_etm__etmq_is_timeless(struct cs_etm_queue *etmq);
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 8b67bd97d122..cace349fb700 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -21,7 +21,7 @@
 #include <babeltrace/ctf/events.h>
 #include <traceevent/event-parse.h>
 #include "asm/bug.h"
-#include "data-convert-bt.h"
+#include "data-convert.h"
 #include "session.h"
 #include "debug.h"
 #include "tool.h"
@@ -949,7 +949,7 @@ static char *change_name(char *name, char *orig_name, int dup)
 	/*
 	 * Add '_' prefix to potential keywork.  According to
 	 * Mathieu Desnoyers (https://lore.kernel.org/lkml/1074266107.40857.1422045946295.JavaMail.zimbra@efficios.com),
-	 * futher CTF spec updating may require us to use '$'.
+	 * further CTF spec updating may require us to use '$'.
 	 */
 	if (dup < 0)
 		len = strlen(name) + sizeof("_");
diff --git a/tools/perf/util/data-convert-bt.h b/tools/perf/util/data-convert-bt.h
deleted file mode 100644
index 821674d63c4e..000000000000
--- a/tools/perf/util/data-convert-bt.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __DATA_CONVERT_BT_H
-#define __DATA_CONVERT_BT_H
-#include "data-convert.h"
-#ifdef HAVE_LIBBABELTRACE_SUPPORT
-
-int bt_convert__perf2ctf(const char *input_name, const char *to_ctf,
-			 struct perf_data_convert_opts *opts);
-
-#endif /* HAVE_LIBBABELTRACE_SUPPORT */
-#endif /* __DATA_CONVERT_BT_H */
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
new file mode 100644
index 000000000000..355cd1948bdf
--- /dev/null
+++ b/tools/perf/util/data-convert-json.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * JSON export.
+ *
+ * Copyright (C) 2021, CodeWeavers Inc. <nfraser@codeweavers.com>
+ */
+
+#include "data-convert.h"
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "linux/compiler.h"
+#include "linux/err.h"
+#include "util/auxtrace.h"
+#include "util/debug.h"
+#include "util/dso.h"
+#include "util/event.h"
+#include "util/evsel.h"
+#include "util/evlist.h"
+#include "util/header.h"
+#include "util/map.h"
+#include "util/session.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/tool.h"
+
+struct convert_json {
+	struct perf_tool tool;
+	FILE *out;
+	bool first;
+	u64 events_count;
+};
+
+// Outputs a JSON-encoded string surrounded by quotes with characters escaped.
+static void output_json_string(FILE *out, const char *s)
+{
+	fputc('"', out);
+	while (*s) {
+		switch (*s) {
+
+		// required escapes with special forms as per RFC 8259
+		case '"':  fputs("\\\"", out); break;
+		case '\\': fputs("\\\\", out); break;
+		case '\b': fputs("\\b", out);  break;
+		case '\f': fputs("\\f", out);  break;
+		case '\n': fputs("\\n", out);  break;
+		case '\r': fputs("\\r", out);  break;
+		case '\t': fputs("\\t", out);  break;
+
+		default:
+			// all other control characters must be escaped by hex code
+			if (*s <= 0x1f)
+				fprintf(out, "\\u%04x", *s);
+			else
+				fputc(*s, out);
+			break;
+		}
+
+		++s;
+	}
+	fputc('"', out);
+}
+
+// Outputs an optional comma, newline and indentation to delimit a new value
+// from the previous one in a JSON object or array.
+static void output_json_delimiters(FILE *out, bool comma, int depth)
+{
+	int i;
+
+	if (comma)
+		fputc(',', out);
+	fputc('\n', out);
+	for (i = 0; i < depth; ++i)
+		fputc('\t', out);
+}
+
+// Outputs a printf format string (with delimiter) as a JSON value.
+__printf(4, 5)
+static void output_json_format(FILE *out, bool comma, int depth, const char *format, ...)
+{
+	va_list args;
+
+	output_json_delimiters(out, comma, depth);
+	va_start(args, format);
+	vfprintf(out,  format, args);
+	va_end(args);
+}
+
+// Outputs a JSON key-value pair where the value is a string.
+static void output_json_key_string(FILE *out, bool comma, int depth,
+		const char *key, const char *value)
+{
+	output_json_delimiters(out, comma, depth);
+	output_json_string(out, key);
+	fputs(": ", out);
+	output_json_string(out, value);
+}
+
+// Outputs a JSON key-value pair where the value is a printf format string.
+__printf(5, 6)
+static void output_json_key_format(FILE *out, bool comma, int depth,
+		const char *key, const char *format, ...)
+{
+	va_list args;
+
+	output_json_delimiters(out, comma, depth);
+	output_json_string(out, key);
+	fputs(": ", out);
+	va_start(args, format);
+	vfprintf(out,  format, args);
+	va_end(args);
+}
+
+static void output_sample_callchain_entry(struct perf_tool *tool,
+		u64 ip, struct addr_location *al)
+{
+	struct convert_json *c = container_of(tool, struct convert_json, tool);
+	FILE *out = c->out;
+
+	output_json_format(out, false, 4, "{");
+	output_json_key_format(out, false, 5, "ip", "\"0x%" PRIx64 "\"", ip);
+
+	if (al && al->sym && al->sym->namelen) {
+		fputc(',', out);
+		output_json_key_string(out, false, 5, "symbol", al->sym->name);
+
+		if (al->map && al->map->dso) {
+			const char *dso = al->map->dso->short_name;
+
+			if (dso && strlen(dso) > 0) {
+				fputc(',', out);
+				output_json_key_string(out, false, 5, "dso", dso);
+			}
+		}
+	}
+
+	output_json_format(out, false, 4, "}");
+}
+
+static int process_sample_event(struct perf_tool *tool,
+				union perf_event *event __maybe_unused,
+				struct perf_sample *sample,
+				struct evsel *evsel __maybe_unused,
+				struct machine *machine)
+{
+	struct convert_json *c = container_of(tool, struct convert_json, tool);
+	FILE *out = c->out;
+	struct addr_location al, tal;
+	u8 cpumode = PERF_RECORD_MISC_USER;
+
+	if (machine__resolve(machine, &al, sample) < 0) {
+		pr_err("Sample resolution failed!\n");
+		return -1;
+	}
+
+	++c->events_count;
+
+	if (c->first)
+		c->first = false;
+	else
+		fputc(',', out);
+	output_json_format(out, false, 2, "{");
+
+	output_json_key_format(out, false, 3, "timestamp", "%" PRIi64, sample->time);
+	output_json_key_format(out, true, 3, "pid", "%i", al.thread->pid_);
+	output_json_key_format(out, true, 3, "tid", "%i", al.thread->tid);
+
+	if (al.thread->cpu >= 0)
+		output_json_key_format(out, true, 3, "cpu", "%i", al.thread->cpu);
+
+	output_json_key_string(out, true, 3, "comm", thread__comm_str(al.thread));
+
+	output_json_key_format(out, true, 3, "callchain", "[");
+	if (sample->callchain) {
+		unsigned int i;
+		bool ok;
+		bool first_callchain = true;
+
+		for (i = 0; i < sample->callchain->nr; ++i) {
+			u64 ip = sample->callchain->ips[i];
+
+			if (ip >= PERF_CONTEXT_MAX) {
+				switch (ip) {
+				case PERF_CONTEXT_HV:
+					cpumode = PERF_RECORD_MISC_HYPERVISOR;
+					break;
+				case PERF_CONTEXT_KERNEL:
+					cpumode = PERF_RECORD_MISC_KERNEL;
+					break;
+				case PERF_CONTEXT_USER:
+					cpumode = PERF_RECORD_MISC_USER;
+					break;
+				default:
+					pr_debug("invalid callchain context: %"
+							PRId64 "\n", (s64) ip);
+					break;
+				}
+				continue;
+			}
+
+			if (first_callchain)
+				first_callchain = false;
+			else
+				fputc(',', out);
+
+			ok = thread__find_symbol(al.thread, cpumode, ip, &tal);
+			output_sample_callchain_entry(tool, ip, ok ? &tal : NULL);
+		}
+	} else {
+		output_sample_callchain_entry(tool, sample->ip, &al);
+	}
+	output_json_format(out, false, 3, "]");
+
+	output_json_format(out, false, 2, "}");
+	return 0;
+}
+
+static void output_headers(struct perf_session *session, struct convert_json *c)
+{
+	struct stat st;
+	struct perf_header *header = &session->header;
+	int ret;
+	int fd = perf_data__fd(session->data);
+	int i;
+	FILE *out = c->out;
+
+	output_json_key_format(out, false, 2, "header-version", "%u", header->version);
+
+	ret = fstat(fd, &st);
+	if (ret >= 0) {
+		time_t stctime = st.st_mtime;
+		char buf[256];
+
+		strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&stctime));
+		output_json_key_string(out, true, 2, "captured-on", buf);
+	} else {
+		pr_debug("Failed to get mtime of source file, not writing captured-on");
+	}
+
+	output_json_key_format(out, true, 2, "data-offset", "%" PRIu64, header->data_offset);
+	output_json_key_format(out, true, 2, "data-size", "%" PRIu64, header->data_size);
+	output_json_key_format(out, true, 2, "feat-offset", "%" PRIu64, header->feat_offset);
+
+	output_json_key_string(out, true, 2, "hostname", header->env.hostname);
+	output_json_key_string(out, true, 2, "os-release", header->env.os_release);
+	output_json_key_string(out, true, 2, "arch", header->env.arch);
+
+	output_json_key_string(out, true, 2, "cpu-desc", header->env.cpu_desc);
+	output_json_key_string(out, true, 2, "cpuid", header->env.cpuid);
+	output_json_key_format(out, true, 2, "nrcpus-online", "%u", header->env.nr_cpus_online);
+	output_json_key_format(out, true, 2, "nrcpus-avail", "%u", header->env.nr_cpus_avail);
+
+	if (header->env.clock.enabled) {
+		output_json_key_format(out, true, 2, "clockid",
+				"%u", header->env.clock.clockid);
+		output_json_key_format(out, true, 2, "clock-time",
+				"%" PRIu64, header->env.clock.clockid_ns);
+		output_json_key_format(out, true, 2, "real-time",
+				"%" PRIu64, header->env.clock.tod_ns);
+	}
+
+	output_json_key_string(out, true, 2, "perf-version", header->env.version);
+
+	output_json_key_format(out, true, 2, "cmdline", "[");
+	for (i = 0; i < header->env.nr_cmdline; i++) {
+		output_json_delimiters(out, i != 0, 3);
+		output_json_string(c->out, header->env.cmdline_argv[i]);
+	}
+	output_json_format(out, false, 2, "]");
+}
+
+int bt_convert__perf2json(const char *input_name, const char *output_name,
+		struct perf_data_convert_opts *opts __maybe_unused)
+{
+	struct perf_session *session;
+	int fd;
+	int ret = -1;
+
+	struct convert_json c = {
+		.tool = {
+			.sample         = process_sample_event,
+			.mmap           = perf_event__process_mmap,
+			.mmap2          = perf_event__process_mmap2,
+			.comm           = perf_event__process_comm,
+			.namespaces     = perf_event__process_namespaces,
+			.cgroup         = perf_event__process_cgroup,
+			.exit           = perf_event__process_exit,
+			.fork           = perf_event__process_fork,
+			.lost           = perf_event__process_lost,
+			.tracing_data   = perf_event__process_tracing_data,
+			.build_id       = perf_event__process_build_id,
+			.id_index       = perf_event__process_id_index,
+			.auxtrace_info  = perf_event__process_auxtrace_info,
+			.auxtrace       = perf_event__process_auxtrace,
+			.event_update   = perf_event__process_event_update,
+			.ordered_events = true,
+			.ordering_requires_timestamps = true,
+		},
+		.first = true,
+		.events_count = 0,
+	};
+
+	struct perf_data data = {
+		.mode = PERF_DATA_MODE_READ,
+		.path = input_name,
+		.force = opts->force,
+	};
+
+	if (opts->all) {
+		pr_err("--all is currently unsupported for JSON output.\n");
+		goto err;
+	}
+	if (opts->tod) {
+		pr_err("--tod is currently unsupported for JSON output.\n");
+		goto err;
+	}
+
+	fd = open(output_name, O_CREAT | O_WRONLY | (opts->force ? O_TRUNC : O_EXCL), 0666);
+	if (fd == -1) {
+		if (errno == EEXIST)
+			pr_err("Output file exists. Use --force to overwrite it.\n");
+		else
+			pr_err("Error opening output file!\n");
+		goto err;
+	}
+
+	c.out = fdopen(fd, "w");
+	if (!c.out) {
+		fprintf(stderr, "Error opening output file!\n");
+		close(fd);
+		goto err;
+	}
+
+	session = perf_session__new(&data, false, &c.tool);
+	if (IS_ERR(session)) {
+		fprintf(stderr, "Error creating perf session!\n");
+		goto err_fclose;
+	}
+
+	if (symbol__init(&session->header.env) < 0) {
+		fprintf(stderr, "Symbol init error!\n");
+		goto err_session_delete;
+	}
+
+	// The opening brace is printed manually because it isn't delimited from a
+	// previous value (i.e. we don't want a leading newline)
+	fputc('{', c.out);
+
+	// Version number for future-proofing. Most additions should be able to be
+	// done in a backwards-compatible way so this should only need to be bumped
+	// if some major breaking change must be made.
+	output_json_format(c.out, false, 1, "\"linux-perf-json-version\": 1");
+
+	// Output headers
+	output_json_format(c.out, true, 1, "\"headers\": {");
+	output_headers(session, &c);
+	output_json_format(c.out, false, 1, "}");
+
+	// Output samples
+	output_json_format(c.out, true, 1, "\"samples\": [");
+	perf_session__process_events(session);
+	output_json_format(c.out, false, 1, "]");
+	output_json_format(c.out, false, 0, "}");
+	fputc('\n', c.out);
+
+	fprintf(stderr,
+			"[ perf data convert: Converted '%s' into JSON data '%s' ]\n",
+			data.path, output_name);
+
+	fprintf(stderr,
+			"[ perf data convert: Converted and wrote %.3f MB (%" PRIu64 " samples) ]\n",
+			(ftell(c.out)) / 1024.0 / 1024.0, c.events_count);
+
+	ret = 0;
+err_session_delete:
+	perf_session__delete(session);
+err_fclose:
+	fclose(c.out);
+err:
+	return ret;
+}
diff --git a/tools/perf/util/data-convert.h b/tools/perf/util/data-convert.h
index feab5f114e37..1b4c5f598415 100644
--- a/tools/perf/util/data-convert.h
+++ b/tools/perf/util/data-convert.h
@@ -2,10 +2,20 @@
 #ifndef __DATA_CONVERT_H
 #define __DATA_CONVERT_H
 
+#include <stdbool.h>
+
 struct perf_data_convert_opts {
 	bool force;
 	bool all;
 	bool tod;
 };
 
+#ifdef HAVE_LIBBABELTRACE_SUPPORT
+int bt_convert__perf2ctf(const char *input_name, const char *to_ctf,
+			 struct perf_data_convert_opts *opts);
+#endif /* HAVE_LIBBABELTRACE_SUPPORT */
+
+int bt_convert__perf2json(const char *input_name, const char *to_ctf,
+			 struct perf_data_convert_opts *opts);
+
 #endif /* __DATA_CONVERT_H */
diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c
index 39c05200ed65..ddf33d58bcd3 100644
--- a/tools/perf/util/demangle-java.c
+++ b/tools/perf/util/demangle-java.c
@@ -147,7 +147,7 @@ error:
  * Demangle Java function signature (openJDK, not GCJ)
  * input:
  * 	str: string to parse. String is not modified
- *    flags: comobination of JAVA_DEMANGLE_* flags to modify demangling
+ *    flags: combination of JAVA_DEMANGLE_* flags to modify demangling
  * return:
  *	if input can be demangled, then a newly allocated string is returned.
  *	if input cannot be demangled, then NULL is returned
@@ -164,7 +164,7 @@ java_demangle_sym(const char *str, int flags)
 	if (!str)
 		return NULL;
 
-	/* find start of retunr type */
+	/* find start of return type */
 	p = strrchr(str, ')');
 	if (!p)
 		return NULL;
diff --git a/tools/perf/util/demangle-ocaml.c b/tools/perf/util/demangle-ocaml.c
index 3df14e67c622..9d707bb60b4b 100644
--- a/tools/perf/util/demangle-ocaml.c
+++ b/tools/perf/util/demangle-ocaml.c
@@ -64,17 +64,5 @@ ocaml_demangle_sym(const char *sym)
 	}
 	result[j] = '\0';
 
-	/* scan backwards to remove an "_" followed by decimal digits */
-	if (j != 0 && isdigit(result[j - 1])) {
-		while (--j) {
-			if (!isdigit(result[j])) {
-				break;
-			}
-		}
-		if (result[j] == '_') {
-			result[j] = '\0';
-		}
-	}
-
 	return result;
 }
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index cd2fe64a3c5d..52e7101c5609 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -216,7 +216,7 @@ struct dso {
 
 /* dso__for_each_symbol - iterate over the symbols of given type
  *
- * @dso: the 'struct dso *' in which symbols itereated
+ * @dso: the 'struct dso *' in which symbols are iterated
  * @pos: the 'struct symbol *' to use as a loop cursor
  * @n: the 'struct rb_node *' to use as a temporary storage
  */
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 7b2d471a6419..b2f4920e19a6 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -91,7 +91,7 @@ static Dwarf_Line *cu_getsrc_die(Dwarf_Die *cu_die, Dwarf_Addr addr)
 			return NULL;
 	} while (laddr == addr);
 	l++;
-	/* Going foward to find the statement line */
+	/* Going forward to find the statement line */
 	do {
 		line = dwarf_onesrcline(lines, l++);
 		if (!line || dwarf_lineaddr(line, &laddr) != 0 ||
@@ -177,7 +177,7 @@ int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
  * die_get_linkage_name - Get the linkage name of the object
  * @dw_die: A DIE of the object
  *
- * Get the linkage name attiribute of given @dw_die.
+ * Get the linkage name attribute of given @dw_die.
  * For C++ binary, the linkage name will be the mangled symbol.
  */
 const char *die_get_linkage_name(Dwarf_Die *dw_die)
@@ -739,7 +739,7 @@ static int __die_walk_instances_cb(Dwarf_Die *inst, void *data)
  * @data: user data
  *
  * Walk on the instances of give @in_die. @in_die must be an inlined function
- * declartion. This returns the return value of @callback if it returns
+ * declaration. This returns the return value of @callback if it returns
  * non-zero value, or -ENOENT if there is no instance.
  */
 int die_walk_instances(Dwarf_Die *or_die, int (*callback)(Dwarf_Die *, void *),
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index 506006e0cf66..cb99646843a9 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -22,7 +22,7 @@ const char *cu_get_comp_dir(Dwarf_Die *cu_die);
 int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr,
 		     const char **fname, int *lineno);
 
-/* Walk on funcitons at given address */
+/* Walk on functions at given address */
 int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
 			 int (*callback)(Dwarf_Die *, void *), void *data);
 
diff --git a/tools/perf/util/dwarf-regs.c b/tools/perf/util/dwarf-regs.c
index 1b49ecee5aff..3fa4486742cd 100644
--- a/tools/perf/util/dwarf-regs.c
+++ b/tools/perf/util/dwarf-regs.c
@@ -24,6 +24,7 @@
 #include "../arch/s390/include/dwarf-regs-table.h"
 #include "../arch/sparc/include/dwarf-regs-table.h"
 #include "../arch/xtensa/include/dwarf-regs-table.h"
+#include "../arch/mips/include/dwarf-regs-table.h"
 
 #define __get_dwarf_regstr(tbl, n) (((n) < ARRAY_SIZE(tbl)) ? (tbl)[(n)] : NULL)
 
@@ -53,6 +54,8 @@ const char *get_dwarf_regstr(unsigned int n, unsigned int machine)
 		return __get_dwarf_regstr(sparc_regstr_tbl, n);
 	case EM_XTENSA:
 		return __get_dwarf_regstr(xtensa_regstr_tbl, n);
+	case EM_MIPS:
+		return __get_dwarf_regstr(mips_regstr_tbl, n);
 	default:
 		pr_err("ELF MACHINE %x is not supported.\n", machine);
 	}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index f603edbbbc6f..8a62fb39e365 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -147,6 +147,7 @@ struct perf_sample {
 	u8  cpumode;
 	u16 misc;
 	u16 ins_lat;
+	u16 p_stage_cyc;
 	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
 	char insn[MAX_INSN];
 	void *raw_data;
@@ -427,5 +428,7 @@ char *get_page_size_name(u64 size, char *str);
 
 void arch_perf_parse_sample_weight(struct perf_sample *data, const __u64 *array, u64 type);
 void arch_perf_synthesize_sample_weight(const struct perf_sample *data, __u64 *array, u64 type);
+const char *arch_perf_header_entry(const char *se_header);
+int arch_support_sort_key(const char *sort_key);
 
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/events_stats.h b/tools/perf/util/events_stats.h
index 859cb34fcff2..3480bafd414b 100644
--- a/tools/perf/util/events_stats.h
+++ b/tools/perf/util/events_stats.h
@@ -21,20 +21,17 @@
  * all struct perf_record_lost_samples.lost fields reported.
  *
  * The total_period is needed because by default auto-freq is used, so
- * multipling nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
+ * multiplying nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
  * the total number of low level events, it is necessary to to sum all struct
  * perf_record_sample.period and stash the result in total_period.
  */
 struct events_stats {
-	u64 total_period;
-	u64 total_non_filtered_period;
 	u64 total_lost;
 	u64 total_lost_samples;
 	u64 total_aux_lost;
 	u64 total_aux_partial;
 	u64 total_invalid_chains;
 	u32 nr_events[PERF_RECORD_HEADER_MAX];
-	u32 nr_non_filtered_samples;
 	u32 nr_lost_warned;
 	u32 nr_unknown_events;
 	u32 nr_invalid_chains;
@@ -44,8 +41,16 @@ struct events_stats {
 	u32 nr_proc_map_timeout;
 };
 
+struct hists_stats {
+	u64 total_period;
+	u64 total_non_filtered_period;
+	u32 nr_samples;
+	u32 nr_non_filtered_samples;
+};
+
 void events_stats__inc(struct events_stats *stats, u32 type);
 
-size_t events_stats__fprintf(struct events_stats *stats, FILE *fp);
+size_t events_stats__fprintf(struct events_stats *stats, FILE *fp,
+			     bool skip_empty);
 
 #endif /* __PERF_EVENTS_STATS_ */
diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c
new file mode 100644
index 000000000000..db3f5fbdebe1
--- /dev/null
+++ b/tools/perf/util/evlist-hybrid.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <inttypes.h>
+#include "cpumap.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "../perf.h"
+#include "util/pmu-hybrid.h"
+#include "util/evlist-hybrid.h"
+#include "debug.h"
+#include <unistd.h>
+#include <stdlib.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <perf/evlist.h>
+#include <perf/evsel.h>
+#include <perf/cpumap.h>
+
+int evlist__add_default_hybrid(struct evlist *evlist, bool precise)
+{
+	struct evsel *evsel;
+	struct perf_pmu *pmu;
+	__u64 config;
+	struct perf_cpu_map *cpus;
+
+	perf_pmu__for_each_hybrid_pmu(pmu) {
+		config = PERF_COUNT_HW_CPU_CYCLES |
+			 ((__u64)pmu->type << PERF_PMU_TYPE_SHIFT);
+		evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE,
+					  config);
+		if (!evsel)
+			return -ENOMEM;
+
+		cpus = perf_cpu_map__get(pmu->cpus);
+		evsel->core.cpus = cpus;
+		evsel->core.own_cpus = perf_cpu_map__get(cpus);
+		evsel->pmu_name = strdup(pmu->name);
+		evlist__add(evlist, evsel);
+	}
+
+	return 0;
+}
+
+static bool group_hybrid_conflict(struct evsel *leader)
+{
+	struct evsel *pos, *prev = NULL;
+
+	for_each_group_evsel(pos, leader) {
+		if (!evsel__is_hybrid(pos))
+			continue;
+
+		if (prev && strcmp(prev->pmu_name, pos->pmu_name))
+			return true;
+
+		prev = pos;
+	}
+
+	return false;
+}
+
+void evlist__warn_hybrid_group(struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel__is_group_leader(evsel) &&
+		    evsel->core.nr_members > 1 &&
+		    group_hybrid_conflict(evsel)) {
+			pr_warning("WARNING: events in group from "
+				   "different hybrid PMUs!\n");
+			return;
+		}
+	}
+}
+
+bool evlist__has_hybrid(struct evlist *evlist)
+{
+	struct evsel *evsel;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel->pmu_name &&
+		    perf_pmu__is_hybrid(evsel->pmu_name)) {
+			return true;
+		}
+	}
+
+	return false;
+}
diff --git a/tools/perf/util/evlist-hybrid.h b/tools/perf/util/evlist-hybrid.h
new file mode 100644
index 000000000000..19f74b4c340a
--- /dev/null
+++ b/tools/perf/util/evlist-hybrid.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_EVLIST_HYBRID_H
+#define __PERF_EVLIST_HYBRID_H
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include "evlist.h"
+#include <unistd.h>
+
+int evlist__add_default_hybrid(struct evlist *evlist, bool precise);
+void evlist__warn_hybrid_group(struct evlist *evlist);
+bool evlist__has_hybrid(struct evlist *evlist);
+
+#endif /* __PERF_EVLIST_HYBRID_H */
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 882cd1f721d9..6e5c41528c7d 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -17,6 +17,7 @@
 #include "evsel.h"
 #include "debug.h"
 #include "units.h"
+#include "bpf_counter.h"
 #include <internal/lib.h> // page_size
 #include "affinity.h"
 #include "../perf.h"
@@ -25,6 +26,7 @@
 #include "util/string2.h"
 #include "util/perf_api_probe.h"
 #include "util/evsel_fprintf.h"
+#include "util/evlist-hybrid.h"
 #include <signal.h>
 #include <unistd.h>
 #include <sched.h>
@@ -36,6 +38,7 @@
 #include <fcntl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/prctl.h>
 
 #include <linux/bitops.h>
 #include <linux/hash.h>
@@ -246,8 +249,10 @@ void evlist__set_leader(struct evlist *evlist)
 
 int __evlist__add_default(struct evlist *evlist, bool precise)
 {
-	struct evsel *evsel = evsel__new_cycles(precise);
+	struct evsel *evsel;
 
+	evsel = evsel__new_cycles(precise, PERF_TYPE_HARDWARE,
+				  PERF_COUNT_HW_CPU_CYCLES);
 	if (evsel == NULL)
 		return -ENOMEM;
 
@@ -420,6 +425,9 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name)
 	if (affinity__setup(&affinity) < 0)
 		return;
 
+	evlist__for_each_entry(evlist, pos)
+		bpf_counter__disable(pos);
+
 	/* Disable 'immediate' events last */
 	for (imm = 0; imm <= 1; imm++) {
 		evlist__for_each_cpu(evlist, i, cpu) {
@@ -1209,7 +1217,7 @@ bool evlist__valid_read_format(struct evlist *evlist)
 		}
 	}
 
-	/* PERF_SAMPLE_READ imples PERF_FORMAT_ID. */
+	/* PERF_SAMPLE_READ implies PERF_FORMAT_ID. */
 	if ((sample_type & PERF_SAMPLE_READ) &&
 	    !(read_format & PERF_FORMAT_ID)) {
 		return false;
@@ -1406,6 +1414,13 @@ int evlist__prepare_workload(struct evlist *evlist, struct target *target, const
 		fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
 
 		/*
+		 * Change the name of this process not to confuse --exclude-perf users
+		 * that sees 'perf' in the window up to the execvp() and thinks that
+		 * perf samples are not being excluded.
+		 */
+		prctl(PR_SET_NAME, "perf-exec");
+
+		/*
 		 * Tell the parent we're ready to go
 		 */
 		close(child_ready_pipe[1]);
@@ -2130,3 +2145,22 @@ struct evsel *evlist__find_evsel(struct evlist *evlist, int idx)
 	}
 	return NULL;
 }
+
+int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf)
+{
+	struct evsel *evsel;
+	int printed = 0;
+
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel__is_dummy_event(evsel))
+			continue;
+		if (size > (strlen(evsel__name(evsel)) + (printed ? 2 : 1))) {
+			printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "," : "", evsel__name(evsel));
+		} else {
+			printed += scnprintf(bf + printed, size - printed, "%s...", printed ? "," : "");
+			break;
+		}
+	}
+
+	return printed;
+}
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index b695ffaae519..a8b97b50cceb 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -365,4 +365,6 @@ int evlist__ctlfd_ack(struct evlist *evlist);
 #define EVLIST_DISABLED_MSG "Events disabled\n"
 
 struct evsel *evlist__find_evsel(struct evlist *evlist, int idx);
+
+int evlist__scnprintf_evsels(struct evlist *evlist, size_t size, char *bf);
 #endif /* __PERF_EVLIST_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 7ecbc8e2fbfa..4a3cd1b5bb33 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -47,6 +47,7 @@
 #include "memswap.h"
 #include "util.h"
 #include "hashmap.h"
+#include "pmu-hybrid.h"
 #include "../perf-sys.h"
 #include "util/parse-branch-options.h"
 #include <internal/xyarray.h>
@@ -295,11 +296,11 @@ static bool perf_event_can_profile_kernel(void)
 	return perf_event_paranoid_check(1);
 }
 
-struct evsel *evsel__new_cycles(bool precise)
+struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config)
 {
 	struct perf_event_attr attr = {
-		.type	= PERF_TYPE_HARDWARE,
-		.config	= PERF_COUNT_HW_CPU_CYCLES,
+		.type	= type,
+		.config	= config,
 		.exclude_kernel	= !perf_event_can_profile_kernel(),
 	};
 	struct evsel *evsel;
@@ -492,6 +493,28 @@ const char *evsel__hw_names[PERF_COUNT_HW_MAX] = {
 	"ref-cycles",
 };
 
+char *evsel__bpf_counter_events;
+
+bool evsel__match_bpf_counter_events(const char *name)
+{
+	int name_len;
+	bool match;
+	char *ptr;
+
+	if (!evsel__bpf_counter_events)
+		return false;
+
+	ptr = strstr(evsel__bpf_counter_events, name);
+	name_len = strlen(name);
+
+	/* check name matches a full token in evsel__bpf_counter_events */
+	match = (ptr != NULL) &&
+		((ptr == evsel__bpf_counter_events) || (*(ptr - 1) == ',')) &&
+		((*(ptr + name_len) == ',') || (*(ptr + name_len) == '\0'));
+
+	return match;
+}
+
 static const char *__evsel__hw_name(u64 config)
 {
 	if (config < PERF_COUNT_HW_MAX && evsel__hw_names[config])
@@ -621,7 +644,7 @@ const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_AL
 #define COP(x)		(1 << x)
 
 /*
- * cache operartion stat
+ * cache operation stat
  * L1I : Read and prefetch only
  * ITLB and BPU : Read-only
  */
@@ -2275,7 +2298,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 		/*
 		 * Undo swap of u64, then swap on individual u32s,
 		 * get the size of the raw area and undo all of the
-		 * swap. The pevent interface handles endianity by
+		 * swap. The pevent interface handles endianness by
 		 * itself.
 		 */
 		if (swapped) {
@@ -2797,3 +2820,8 @@ void evsel__zero_per_pkg(struct evsel *evsel)
 		hashmap__clear(evsel->per_pkg_mask);
 	}
 }
+
+bool evsel__is_hybrid(struct evsel *evsel)
+{
+	return evsel->pmu_name && perf_pmu__is_hybrid(evsel->pmu_name);
+}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 6026487353dd..75cf5dbfe208 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -20,6 +20,8 @@ union perf_event;
 struct bpf_counter_ops;
 struct target;
 struct hashmap;
+struct bperf_leader_bpf;
+struct bperf_follower_bpf;
 
 typedef int (evsel__sb_cb_t)(union perf_event *event, void *data);
 
@@ -80,6 +82,7 @@ struct evsel {
 		bool			auto_merge_stats;
 		bool			collect_stat;
 		bool			weak_group;
+		bool			bpf_counter;
 		int			bpf_fd;
 		struct bpf_object	*bpf_obj;
 	};
@@ -113,6 +116,7 @@ struct evsel {
 	bool			merged_stat;
 	bool			reset_group;
 	bool			errored;
+	bool			use_config_name;
 	struct hashmap		*per_pkg_mask;
 	struct evsel		*leader;
 	struct list_head	config_terms;
@@ -130,8 +134,24 @@ struct evsel {
 	 * See also evsel__has_callchain().
 	 */
 	__u64			synth_sample_type;
-	struct list_head	bpf_counter_list;
+
+	/*
+	 * bpf_counter_ops serves two use cases:
+	 *   1. perf-stat -b          counting events used byBPF programs
+	 *   2. perf-stat --use-bpf   use BPF programs to aggregate counts
+	 */
 	struct bpf_counter_ops	*bpf_counter_ops;
+
+	/* for perf-stat -b */
+	struct list_head	bpf_counter_list;
+
+	/* for perf-stat --use-bpf */
+	int			bperf_leader_prog_fd;
+	int			bperf_leader_link_fd;
+	union {
+		struct bperf_leader_bpf *leader_skel;
+		struct bperf_follower_bpf *follower_skel;
+	};
 };
 
 struct perf_missing_features {
@@ -157,7 +177,6 @@ struct perf_missing_features {
 extern struct perf_missing_features perf_missing_features;
 
 struct perf_cpu_map;
-struct target;
 struct thread_map;
 struct record_opts;
 
@@ -202,7 +221,7 @@ static inline struct evsel *evsel__newtp(const char *sys, const char *name)
 	return evsel__newtp_idx(sys, name, 0);
 }
 
-struct evsel *evsel__new_cycles(bool precise);
+struct evsel *evsel__new_cycles(bool precise, __u32 type, __u64 config);
 
 struct tep_event *event_format__new(const char *sys, const char *name);
 
@@ -222,6 +241,11 @@ void evsel__calc_id_pos(struct evsel *evsel);
 
 bool evsel__is_cache_op_valid(u8 type, u8 op);
 
+static inline bool evsel__is_bpf(struct evsel *evsel)
+{
+	return evsel->bpf_counter_ops != NULL;
+}
+
 #define EVSEL__MAX_ALIASES 8
 
 extern const char *evsel__hw_cache[PERF_COUNT_HW_CACHE_MAX][EVSEL__MAX_ALIASES];
@@ -229,6 +253,9 @@ extern const char *evsel__hw_cache_op[PERF_COUNT_HW_CACHE_OP_MAX][EVSEL__MAX_ALI
 extern const char *evsel__hw_cache_result[PERF_COUNT_HW_CACHE_RESULT_MAX][EVSEL__MAX_ALIASES];
 extern const char *evsel__hw_names[PERF_COUNT_HW_MAX];
 extern const char *evsel__sw_names[PERF_COUNT_SW_MAX];
+extern char *evsel__bpf_counter_events;
+bool evsel__match_bpf_counter_events(const char *name);
+
 int __evsel__hw_cache_type_op_res_name(u8 type, u8 op, u8 result, char *bf, size_t size);
 const char *evsel__name(struct evsel *evsel);
 
@@ -435,4 +462,5 @@ struct perf_env *evsel__env(struct evsel *evsel);
 int evsel__store_ids(struct evsel *evsel, struct evlist *evlist);
 
 void evsel__zero_per_pkg(struct evsel *evsel);
+bool evsel__is_hybrid(struct evsel *evsel);
 #endif /* __PERF_EVSEL_H */
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
index dcf8d19b83c8..85df3e4771e4 100644
--- a/tools/perf/util/expr.h
+++ b/tools/perf/util/expr.h
@@ -3,7 +3,7 @@
 #define PARSE_CTX_H 1
 
 // There are fixes that need to land upstream before we can use libbpf's headers,
-// for now use our copy uncoditionally, since the data structures at this point
+// for now use our copy unconditionally, since the data structures at this point
 // are exactly the same, no problem.
 //#ifdef HAVE_LIBBPF_SUPPORT
 //#include <bpf/hashmap.h>
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 20effdff76ce..aa1e42518d37 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -127,7 +127,7 @@ static int __do_write_buf(struct feat_fd *ff,  const void *buf, size_t size)
 	return 0;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 int do_write(struct feat_fd *ff, const void *buf, size_t size)
 {
 	if (!ff->buf)
@@ -135,7 +135,7 @@ int do_write(struct feat_fd *ff, const void *buf, size_t size)
 	return __do_write_buf(ff, buf, size);
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size)
 {
 	u64 *p = (u64 *) set;
@@ -154,7 +154,7 @@ static int do_write_bitmap(struct feat_fd *ff, unsigned long *set, u64 size)
 	return 0;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 int write_padded(struct feat_fd *ff, const void *bf,
 		 size_t count, size_t count_aligned)
 {
@@ -170,7 +170,7 @@ int write_padded(struct feat_fd *ff, const void *bf,
 #define string_size(str)						\
 	(PERF_ALIGN((strlen(str) + 1), NAME_ALIGN) + sizeof(u32))
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_write_string(struct feat_fd *ff, const char *str)
 {
 	u32 len, olen;
@@ -266,7 +266,7 @@ static char *do_read_string(struct feat_fd *ff)
 	return NULL;
 }
 
-/* Return: 0 if succeded, -ERR if failed. */
+/* Return: 0 if succeeded, -ERR if failed. */
 static int do_read_bitmap(struct feat_fd *ff, unsigned long **pset, u64 *psize)
 {
 	unsigned long *set;
@@ -2874,7 +2874,7 @@ static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused)
 	int err = -1;
 
 	if (ff->ph->needs_swap) {
-		pr_warning("interpreting bpf_prog_info from systems with endianity is not yet supported\n");
+		pr_warning("interpreting bpf_prog_info from systems with endianness is not yet supported\n");
 		return 0;
 	}
 
@@ -2942,7 +2942,7 @@ static int process_bpf_btf(struct feat_fd *ff, void *data __maybe_unused)
 	int err = -1;
 
 	if (ff->ph->needs_swap) {
-		pr_warning("interpreting btf from systems with endianity is not yet supported\n");
+		pr_warning("interpreting btf from systems with endianness is not yet supported\n");
 		return 0;
 	}
 
@@ -3481,11 +3481,11 @@ static const size_t attr_pipe_abi_sizes[] = {
 };
 
 /*
- * In the legacy pipe format, there is an implicit assumption that endiannesss
+ * In the legacy pipe format, there is an implicit assumption that endianness
  * between host recording the samples, and host parsing the samples is the
  * same. This is not always the case given that the pipe output may always be
  * redirected into a file and analyzed on a different machine with possibly a
- * different endianness and perf_event ABI revsions in the perf tool itself.
+ * different endianness and perf_event ABI revisions in the perf tool itself.
  */
 static int try_all_pipe_abis(uint64_t hdr_sz, struct perf_header *ph)
 {
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index c82f5fc26af8..65fe65ba03c2 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -211,6 +211,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 	hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10);
 	hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13);
 	hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13);
+	hists__new_col_len(hists, HISTC_P_STAGE_CYC, 13);
 	if (symbol_conf.nanosecs)
 		hists__new_col_len(hists, HISTC_TIME, 16);
 	else
@@ -289,13 +290,14 @@ static long hist_time(unsigned long htime)
 }
 
 static void he_stat__add_period(struct he_stat *he_stat, u64 period,
-				u64 weight, u64 ins_lat)
+				u64 weight, u64 ins_lat, u64 p_stage_cyc)
 {
 
 	he_stat->period		+= period;
 	he_stat->weight		+= weight;
 	he_stat->nr_events	+= 1;
 	he_stat->ins_lat	+= ins_lat;
+	he_stat->p_stage_cyc	+= p_stage_cyc;
 }
 
 static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
@@ -308,6 +310,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src)
 	dest->nr_events		+= src->nr_events;
 	dest->weight		+= src->weight;
 	dest->ins_lat		+= src->ins_lat;
+	dest->p_stage_cyc		+= src->p_stage_cyc;
 }
 
 static void he_stat__decay(struct he_stat *he_stat)
@@ -597,6 +600,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 	u64 period = entry->stat.period;
 	u64 weight = entry->stat.weight;
 	u64 ins_lat = entry->stat.ins_lat;
+	u64 p_stage_cyc = entry->stat.p_stage_cyc;
 	bool leftmost = true;
 
 	p = &hists->entries_in->rb_root.rb_node;
@@ -615,11 +619,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists,
 
 		if (!cmp) {
 			if (sample_self) {
-				he_stat__add_period(&he->stat, period, weight, ins_lat);
+				he_stat__add_period(&he->stat, period, weight, ins_lat, p_stage_cyc);
 				hist_entry__add_callchain_period(he, period);
 			}
 			if (symbol_conf.cumulate_callchain)
-				he_stat__add_period(he->stat_acc, period, weight, ins_lat);
+				he_stat__add_period(he->stat_acc, period, weight, ins_lat, p_stage_cyc);
 
 			/*
 			 * This mem info was allocated from sample__resolve_mem
@@ -731,6 +735,7 @@ __hists__add_entry(struct hists *hists,
 			.period	= sample->period,
 			.weight = sample->weight,
 			.ins_lat = sample->ins_lat,
+			.p_stage_cyc = sample->p_stage_cyc,
 		},
 		.parent = sym_parent,
 		.filtered = symbol__parent_filter(sym_parent) | al->filtered,
@@ -2320,14 +2325,19 @@ void events_stats__inc(struct events_stats *stats, u32 type)
 	++stats->nr_events[type];
 }
 
-void hists__inc_nr_events(struct hists *hists, u32 type)
+static void hists_stats__inc(struct hists_stats *stats)
 {
-	events_stats__inc(&hists->stats, type);
+	++stats->nr_samples;
+}
+
+void hists__inc_nr_events(struct hists *hists)
+{
+	hists_stats__inc(&hists->stats);
 }
 
 void hists__inc_nr_samples(struct hists *hists, bool filtered)
 {
-	events_stats__inc(&hists->stats, PERF_RECORD_SAMPLE);
+	hists_stats__inc(&hists->stats);
 	if (!filtered)
 		hists->stats.nr_non_filtered_samples++;
 }
@@ -2666,14 +2676,21 @@ void hist__account_cycles(struct branch_stack *bs, struct addr_location *al,
 	}
 }
 
-size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp)
+size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp,
+				 bool skip_empty)
 {
 	struct evsel *pos;
 	size_t ret = 0;
 
 	evlist__for_each_entry(evlist, pos) {
+		struct hists *hists = evsel__hists(pos);
+
+		if (skip_empty && !hists->stats.nr_samples)
+			continue;
+
 		ret += fprintf(fp, "%s stats:\n", evsel__name(pos));
-		ret += events_stats__fprintf(&evsel__hists(pos)->stats, fp);
+		ret += fprintf(fp, "%16s events: %10d\n",
+			       "SAMPLE", hists->stats.nr_samples);
 	}
 
 	return ret;
@@ -2693,7 +2710,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh
 	const struct dso *dso = hists->dso_filter;
 	struct thread *thread = hists->thread_filter;
 	int socket_id = hists->socket_filter;
-	unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+	unsigned long nr_samples = hists->stats.nr_samples;
 	u64 nr_events = hists->stats.total_period;
 	struct evsel *evsel = hists_to_evsel(hists);
 	const char *ev_name = evsel__name(evsel);
@@ -2720,7 +2737,7 @@ int __hists__scnprintf_title(struct hists *hists, char *bf, size_t size, bool sh
 				nr_samples += pos_hists->stats.nr_non_filtered_samples;
 				nr_events += pos_hists->stats.total_non_filtered_period;
 			} else {
-				nr_samples += pos_hists->stats.nr_events[PERF_RECORD_SAMPLE];
+				nr_samples += pos_hists->stats.nr_samples;
 				nr_events += pos_hists->stats.total_period;
 			}
 		}
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 3c537232294b..5343b62476e6 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -75,6 +75,7 @@ enum hist_column {
 	HISTC_MEM_BLOCKED,
 	HISTC_LOCAL_INS_LAT,
 	HISTC_GLOBAL_INS_LAT,
+	HISTC_P_STAGE_CYC,
 	HISTC_NR_COLS, /* Last entry */
 };
 
@@ -95,7 +96,7 @@ struct hists {
 	const char		*uid_filter_str;
 	const char		*symbol_filter_str;
 	pthread_mutex_t		lock;
-	struct events_stats	stats;
+	struct hists_stats	stats;
 	u64			event_stream;
 	u16			col_len[HISTC_NR_COLS];
 	bool			has_callchains;
@@ -195,13 +196,14 @@ struct hist_entry *hists__get_entry(struct hists *hists, int idx);
 u64 hists__total_period(struct hists *hists);
 void hists__reset_stats(struct hists *hists);
 void hists__inc_stats(struct hists *hists, struct hist_entry *h);
-void hists__inc_nr_events(struct hists *hists, u32 type);
+void hists__inc_nr_events(struct hists *hists);
 void hists__inc_nr_samples(struct hists *hists, bool filtered);
 
 size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows,
 		      int max_cols, float min_pcnt, FILE *fp,
 		      bool ignore_callchains);
-size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp);
+size_t evlist__fprintf_nr_events(struct evlist *evlist, FILE *fp,
+				 bool skip_empty);
 
 void hists__filter_by_dso(struct hists *hists);
 void hists__filter_by_thread(struct hists *hists);
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index f6e28ac231b7..8658d42ce57a 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -3569,7 +3569,7 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
 	/*
 	 * Since this thread will not be kept in any rbtree not in a
 	 * list, initialize its list node so that at thread__put() the
-	 * current thread lifetime assuption is kept and we don't segfault
+	 * current thread lifetime assumption is kept and we don't segfault
 	 * at list_del_init().
 	 */
 	INIT_LIST_HEAD(&pt->unknown_thread->node);
diff --git a/tools/perf/util/iostat.c b/tools/perf/util/iostat.c
new file mode 100644
index 000000000000..57dd49da28fe
--- /dev/null
+++ b/tools/perf/util/iostat.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "util/iostat.h"
+#include "util/debug.h"
+
+enum iostat_mode_t iostat_mode = IOSTAT_NONE;
+
+__weak int iostat_prepare(struct evlist *evlist __maybe_unused,
+			  struct perf_stat_config *config __maybe_unused)
+{
+	return -1;
+}
+
+__weak int iostat_parse(const struct option *opt __maybe_unused,
+			 const char *str __maybe_unused,
+			 int unset __maybe_unused)
+{
+	pr_err("iostat mode is not supported on current platform\n");
+	return -1;
+}
+
+__weak void iostat_list(struct evlist *evlist __maybe_unused,
+		       struct perf_stat_config *config __maybe_unused)
+{
+}
+
+__weak void iostat_release(struct evlist *evlist __maybe_unused)
+{
+}
+
+__weak void iostat_print_header_prefix(struct perf_stat_config *config __maybe_unused)
+{
+}
+
+__weak void iostat_print_metric(struct perf_stat_config *config __maybe_unused,
+				struct evsel *evsel __maybe_unused,
+				struct perf_stat_output_ctx *out __maybe_unused)
+{
+}
+
+__weak void iostat_prefix(struct evlist *evlist __maybe_unused,
+			  struct perf_stat_config *config __maybe_unused,
+			  char *prefix __maybe_unused,
+			  struct timespec *ts __maybe_unused)
+{
+}
+
+__weak void iostat_print_counters(struct evlist *evlist __maybe_unused,
+				  struct perf_stat_config *config __maybe_unused,
+				  struct timespec *ts __maybe_unused,
+				  char *prefix __maybe_unused,
+				  iostat_print_counter_t print_cnt_cb __maybe_unused)
+{
+}
diff --git a/tools/perf/util/iostat.h b/tools/perf/util/iostat.h
new file mode 100644
index 000000000000..23c1c46a331a
--- /dev/null
+++ b/tools/perf/util/iostat.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * perf iostat
+ *
+ * Copyright (C) 2020, Intel Corporation
+ *
+ * Authors: Alexander Antonov <alexander.antonov@linux.intel.com>
+ */
+
+#ifndef _IOSTAT_H
+#define _IOSTAT_H
+
+#include <subcmd/parse-options.h>
+#include "util/stat.h"
+#include "util/parse-events.h"
+#include "util/evlist.h"
+
+struct option;
+struct perf_stat_config;
+struct evlist;
+struct timespec;
+
+enum iostat_mode_t {
+	IOSTAT_NONE		= -1,
+	IOSTAT_RUN		= 0,
+	IOSTAT_LIST		= 1
+};
+
+extern enum iostat_mode_t iostat_mode;
+
+typedef void (*iostat_print_counter_t)(struct perf_stat_config *, struct evsel *, char *);
+
+int iostat_prepare(struct evlist *evlist, struct perf_stat_config *config);
+int iostat_parse(const struct option *opt, const char *str,
+		 int unset __maybe_unused);
+void iostat_list(struct evlist *evlist, struct perf_stat_config *config);
+void iostat_release(struct evlist *evlist);
+void iostat_prefix(struct evlist *evlist, struct perf_stat_config *config,
+		   char *prefix, struct timespec *ts);
+void iostat_print_header_prefix(struct perf_stat_config *config);
+void iostat_print_metric(struct perf_stat_config *config, struct evsel *evsel,
+			 struct perf_stat_output_ctx *out);
+void iostat_print_counters(struct evlist *evlist,
+			   struct perf_stat_config *config, struct timespec *ts,
+			   char *prefix, iostat_print_counter_t print_cnt_cb);
+
+#endif /* _IOSTAT_H */
diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c
index 9760d8e7b386..917a9c707371 100644
--- a/tools/perf/util/jitdump.c
+++ b/tools/perf/util/jitdump.c
@@ -396,21 +396,31 @@ static pid_t jr_entry_tid(struct jit_buf_desc *jd, union jr_entry *jr)
 
 static uint64_t convert_timestamp(struct jit_buf_desc *jd, uint64_t timestamp)
 {
-	struct perf_tsc_conversion tc;
+	struct perf_tsc_conversion tc = { .time_shift = 0, };
+	struct perf_record_time_conv *time_conv = &jd->session->time_conv;
 
 	if (!jd->use_arch_timestamp)
 		return timestamp;
 
-	tc.time_shift	       = jd->session->time_conv.time_shift;
-	tc.time_mult	       = jd->session->time_conv.time_mult;
-	tc.time_zero	       = jd->session->time_conv.time_zero;
-	tc.time_cycles	       = jd->session->time_conv.time_cycles;
-	tc.time_mask	       = jd->session->time_conv.time_mask;
-	tc.cap_user_time_zero  = jd->session->time_conv.cap_user_time_zero;
-	tc.cap_user_time_short = jd->session->time_conv.cap_user_time_short;
+	tc.time_shift = time_conv->time_shift;
+	tc.time_mult  = time_conv->time_mult;
+	tc.time_zero  = time_conv->time_zero;
 
-	if (!tc.cap_user_time_zero)
-		return 0;
+	/*
+	 * The event TIME_CONV was extended for the fields from "time_cycles"
+	 * when supported cap_user_time_short, for backward compatibility,
+	 * checks the event size and assigns these extended fields if these
+	 * fields are contained in the event.
+	 */
+	if (event_contains(*time_conv, time_cycles)) {
+		tc.time_cycles	       = time_conv->time_cycles;
+		tc.time_mask	       = time_conv->time_mask;
+		tc.cap_user_time_zero  = time_conv->cap_user_time_zero;
+		tc.cap_user_time_short = time_conv->cap_user_time_short;
+
+		if (!tc.cap_user_time_zero)
+			return 0;
+	}
 
 	return tsc_to_perf_time(timestamp, &tc);
 }
diff --git a/tools/perf/util/levenshtein.c b/tools/perf/util/levenshtein.c
index a217ecf0359d..6a6712635aa4 100644
--- a/tools/perf/util/levenshtein.c
+++ b/tools/perf/util/levenshtein.c
@@ -30,7 +30,7 @@
  *
  * It does so by calculating the costs of the path ending in characters
  * i (in string1) and j (in string2), respectively, given that the last
- * operation is a substition, a swap, a deletion, or an insertion.
+ * operation is a substitution, a swap, a deletion, or an insertion.
  *
  * This implementation allows the costs to be weighted:
  *
diff --git a/tools/perf/util/libunwind/arm64.c b/tools/perf/util/libunwind/arm64.c
index 6b4e5a0892f8..c397be0c2e32 100644
--- a/tools/perf/util/libunwind/arm64.c
+++ b/tools/perf/util/libunwind/arm64.c
@@ -4,7 +4,7 @@
  * generic one.
  *
  * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch
- * name and the defination of this function is included directly from
+ * name and the definition of this function is included directly from
  * 'arch/arm64/util/unwind-libunwind.c', to make sure that this function
  * is defined no matter what arch the host is.
  *
diff --git a/tools/perf/util/libunwind/x86_32.c b/tools/perf/util/libunwind/x86_32.c
index 21c216c40a3b..b2b92d030aef 100644
--- a/tools/perf/util/libunwind/x86_32.c
+++ b/tools/perf/util/libunwind/x86_32.c
@@ -4,7 +4,7 @@
  * generic one.
  *
  * The function 'LIBUNWIND__ARCH_REG_ID' name is set according to arch
- * name and the defination of this function is included directly from
+ * name and the definition of this function is included directly from
  * 'arch/x86/util/unwind-libunwind.c', to make sure that this function
  * is defined no matter what arch the host is.
  *
diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c
index dbdffb6673fe..3ceaf7ef3301 100644
--- a/tools/perf/util/llvm-utils.c
+++ b/tools/perf/util/llvm-utils.c
@@ -471,7 +471,7 @@ int llvm__compile_bpf(const char *path, void **p_obj_buf,
 
 	/*
 	 * This is an optional work. Even it fail we can continue our
-	 * work. Needn't to check error return.
+	 * work. Needn't check error return.
 	 */
 	llvm__get_kbuild_opts(&kbuild_dir, &kbuild_include_opts);
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b5c2d8be4144..3ff4936a15a4 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -905,7 +905,7 @@ static struct map *machine__addnew_module_map(struct machine *machine, u64 start
 
 	maps__insert(&machine->kmaps, map);
 
-	/* Put the map here because maps__insert alread got it */
+	/* Put the map here because maps__insert already got it */
 	map__put(map);
 out:
 	/* put the dso here, corresponding to  machine__findnew_module_dso */
@@ -1952,7 +1952,7 @@ int machine__process_fork_event(struct machine *machine, union perf_event *event
 	 * maps because that is what the kernel just did.
 	 *
 	 * But when synthesizing, this should not be done.  If we do, we end up
-	 * with overlapping maps as we process the sythesized MMAP2 events that
+	 * with overlapping maps as we process the synthesized MMAP2 events that
 	 * get delivered shortly thereafter.
 	 *
 	 * Use the FORK event misc flags in an internal way to signal this
@@ -2038,8 +2038,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
 static bool symbol__match_regex(struct symbol *sym, regex_t *regex)
 {
 	if (!regexec(regex, sym->name, 0, NULL, 0))
-		return 1;
-	return 0;
+		return true;
+	return false;
 }
 
 static void ip__resolve_ams(struct thread *thread,
@@ -2518,7 +2518,7 @@ static bool has_stitched_lbr(struct thread *thread,
 
 	/*
 	 * Check if there are identical LBRs between two samples.
-	 * Identicall LBRs must have same from, to and flags values. Also,
+	 * Identical LBRs must have same from, to and flags values. Also,
 	 * they have to be saved in the same LBR registers (same physical
 	 * index).
 	 *
@@ -2588,7 +2588,7 @@ err:
 }
 
 /*
- * Recolve LBR callstack chain sample
+ * Resolve LBR callstack chain sample
  * Return:
  * 1 on success get LBR callchain information
  * 0 no available LBR callchain information, should try fp
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 9f32825c98d8..d32f5b28c1fb 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -75,7 +75,7 @@ struct thread;
 
 /* map__for_each_symbol - iterate over the symbols in the given map
  *
- * @map: the 'struct map *' in which symbols itereated
+ * @map: the 'struct map *' in which symbols are iterated
  * @pos: the 'struct symbol *' to use as a loop cursor
  * @n: the 'struct rb_node *' to use as a temporary storage
  * Note: caller must ensure map->dso is not NULL (map is loaded).
@@ -86,7 +86,7 @@ struct thread;
 /* map__for_each_symbol_with_name - iterate over the symbols in the given map
  *                                  that have the given name
  *
- * @map: the 'struct map *' in which symbols itereated
+ * @map: the 'struct map *' in which symbols are iterated
  * @sym_name: the symbol name
  * @pos: the 'struct symbol *' to use as a loop cursor
  */
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
index 755cef7e0625..cacdebd65b8a 100644
--- a/tools/perf/util/mem-events.h
+++ b/tools/perf/util/mem-events.h
@@ -44,7 +44,6 @@ bool is_mem_loads_aux_event(struct evsel *leader);
 
 void perf_mem_events__list(void);
 
-struct mem_info;
 int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
 int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
 int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info);
@@ -81,7 +80,7 @@ struct c2c_stats {
 	u32	rmt_dram;            /* count of loads miss to remote DRAM */
 	u32	blk_data;            /* count of loads blocked by data */
 	u32	blk_addr;            /* count of loads blocked by address conflict */
-	u32	nomap;               /* count of load/stores with no phys adrs */
+	u32	nomap;               /* count of load/stores with no phys addrs */
 	u32	noparse;             /* count of unparsable data sources */
 };
 
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 26c990e32378..8336dd8e8098 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -181,7 +181,7 @@ static bool evsel_same_pmu(struct evsel *ev1, struct evsel *ev2)
  * @pctx: the parse context for the metric expression.
  * @metric_no_merge: don't attempt to share events for the metric with other
  * metrics.
- * @has_constraint: is there a contraint on the group of events? In which case
+ * @has_constraint: is there a constraint on the group of events? In which case
  * the events won't be grouped.
  * @metric_events: out argument, null terminated array of evsel's associated
  * with the metric.
@@ -618,7 +618,7 @@ static int metricgroup__print_sys_event_iter(struct pmu_event *pe, void *data)
 void metricgroup__print(bool metrics, bool metricgroups, char *filter,
 			bool raw, bool details)
 {
-	struct pmu_events_map *map = perf_pmu__find_map(NULL);
+	struct pmu_events_map *map = pmu_events_map__find();
 	struct pmu_event *pe;
 	int i;
 	struct rblist groups;
@@ -900,7 +900,8 @@ static int __add_metric(struct list_head *metric_list,
 		    (match_metric(__pe->metric_group, __metric) ||	\
 		     match_metric(__pe->metric_name, __metric)))
 
-static struct pmu_event *find_metric(const char *metric, struct pmu_events_map *map)
+struct pmu_event *metricgroup__find_metric(const char *metric,
+					   struct pmu_events_map *map)
 {
 	struct pmu_event *pe;
 	int i;
@@ -985,7 +986,7 @@ static int __resolve_metric(struct metric *m,
 			struct expr_id *parent;
 			struct pmu_event *pe;
 
-			pe = find_metric(cur->key, map);
+			pe = metricgroup__find_metric(cur->key, map);
 			if (!pe)
 				continue;
 
@@ -1253,8 +1254,7 @@ int metricgroup__parse_groups(const struct option *opt,
 			      struct rblist *metric_events)
 {
 	struct evlist *perf_evlist = *(struct evlist **)opt->value;
-	struct pmu_events_map *map = perf_pmu__find_map(NULL);
-
+	struct pmu_events_map *map = pmu_events_map__find();
 
 	return parse_groups(perf_evlist, str, metric_no_group,
 			    metric_no_merge, NULL, metric_events, map);
@@ -1273,7 +1273,7 @@ int metricgroup__parse_groups_test(struct evlist *evlist,
 
 bool metricgroup__has_metric(const char *metric)
 {
-	struct pmu_events_map *map = perf_pmu__find_map(NULL);
+	struct pmu_events_map *map = pmu_events_map__find();
 	struct pmu_event *pe;
 	int i;
 
diff --git a/tools/perf/util/metricgroup.h b/tools/perf/util/metricgroup.h
index ed1b9392e624..cc4a92492a61 100644
--- a/tools/perf/util/metricgroup.h
+++ b/tools/perf/util/metricgroup.h
@@ -9,7 +9,6 @@
 
 struct evlist;
 struct evsel;
-struct evlist;
 struct option;
 struct rblist;
 struct pmu_events_map;
@@ -44,7 +43,8 @@ int metricgroup__parse_groups(const struct option *opt,
 			      bool metric_no_group,
 			      bool metric_no_merge,
 			      struct rblist *metric_events);
-
+struct pmu_event *metricgroup__find_metric(const char *metric,
+					   struct pmu_events_map *map);
 int metricgroup__parse_groups_test(struct evlist *evlist,
 				   struct pmu_events_map *map,
 				   const char *str,
diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c
new file mode 100644
index 000000000000..10160ab126f9
--- /dev/null
+++ b/tools/perf/util/parse-events-hybrid.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/err.h>
+#include <linux/zalloc.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/param.h>
+#include "evlist.h"
+#include "evsel.h"
+#include "parse-events.h"
+#include "parse-events-hybrid.h"
+#include "debug.h"
+#include "pmu.h"
+#include "pmu-hybrid.h"
+#include "perf.h"
+
+static void config_hybrid_attr(struct perf_event_attr *attr,
+			       int type, int pmu_type)
+{
+	/*
+	 * attr.config layout for type PERF_TYPE_HARDWARE and
+	 * PERF_TYPE_HW_CACHE
+	 *
+	 * PERF_TYPE_HARDWARE:                 0xEEEEEEEE000000AA
+	 *                                     AA: hardware event ID
+	 *                                     EEEEEEEE: PMU type ID
+	 * PERF_TYPE_HW_CACHE:                 0xEEEEEEEE00DDCCBB
+	 *                                     BB: hardware cache ID
+	 *                                     CC: hardware cache op ID
+	 *                                     DD: hardware cache op result ID
+	 *                                     EEEEEEEE: PMU type ID
+	 * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied.
+	 */
+	attr->type = type;
+	attr->config = attr->config | ((__u64)pmu_type << PERF_PMU_TYPE_SHIFT);
+}
+
+static int create_event_hybrid(__u32 config_type, int *idx,
+			       struct list_head *list,
+			       struct perf_event_attr *attr, char *name,
+			       struct list_head *config_terms,
+			       struct perf_pmu *pmu)
+{
+	struct evsel *evsel;
+	__u32 type = attr->type;
+	__u64 config = attr->config;
+
+	config_hybrid_attr(attr, config_type, pmu->type);
+	evsel = parse_events__add_event_hybrid(list, idx, attr, name,
+					       pmu, config_terms);
+	if (evsel)
+		evsel->pmu_name = strdup(pmu->name);
+	else
+		return -ENOMEM;
+
+	attr->type = type;
+	attr->config = config;
+	return 0;
+}
+
+static int pmu_cmp(struct parse_events_state *parse_state,
+		   struct perf_pmu *pmu)
+{
+	if (!parse_state->hybrid_pmu_name)
+		return 0;
+
+	return strcmp(parse_state->hybrid_pmu_name, pmu->name);
+}
+
+static int add_hw_hybrid(struct parse_events_state *parse_state,
+			 struct list_head *list, struct perf_event_attr *attr,
+			 char *name, struct list_head *config_terms)
+{
+	struct perf_pmu *pmu;
+	int ret;
+
+	perf_pmu__for_each_hybrid_pmu(pmu) {
+		if (pmu_cmp(parse_state, pmu))
+			continue;
+
+		ret = create_event_hybrid(PERF_TYPE_HARDWARE,
+					  &parse_state->idx, list, attr, name,
+					  config_terms, pmu);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int create_raw_event_hybrid(int *idx, struct list_head *list,
+				   struct perf_event_attr *attr, char *name,
+				   struct list_head *config_terms,
+				   struct perf_pmu *pmu)
+{
+	struct evsel *evsel;
+
+	attr->type = pmu->type;
+	evsel = parse_events__add_event_hybrid(list, idx, attr, name,
+					       pmu, config_terms);
+	if (evsel)
+		evsel->pmu_name = strdup(pmu->name);
+	else
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int add_raw_hybrid(struct parse_events_state *parse_state,
+			  struct list_head *list, struct perf_event_attr *attr,
+			  char *name, struct list_head *config_terms)
+{
+	struct perf_pmu *pmu;
+	int ret;
+
+	perf_pmu__for_each_hybrid_pmu(pmu) {
+		if (pmu_cmp(parse_state, pmu))
+			continue;
+
+		ret = create_raw_event_hybrid(&parse_state->idx, list, attr,
+					      name, config_terms, pmu);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state,
+				     struct list_head *list,
+				     struct perf_event_attr *attr,
+				     char *name, struct list_head *config_terms,
+				     bool *hybrid)
+{
+	*hybrid = false;
+	if (attr->type == PERF_TYPE_SOFTWARE)
+		return 0;
+
+	if (!perf_pmu__has_hybrid())
+		return 0;
+
+	*hybrid = true;
+	if (attr->type != PERF_TYPE_RAW) {
+		return add_hw_hybrid(parse_state, list, attr, name,
+				     config_terms);
+	}
+
+	return add_raw_hybrid(parse_state, list, attr, name,
+			      config_terms);
+}
+
+int parse_events__add_cache_hybrid(struct list_head *list, int *idx,
+				   struct perf_event_attr *attr, char *name,
+				   struct list_head *config_terms,
+				   bool *hybrid,
+				   struct parse_events_state *parse_state)
+{
+	struct perf_pmu *pmu;
+	int ret;
+
+	*hybrid = false;
+	if (!perf_pmu__has_hybrid())
+		return 0;
+
+	*hybrid = true;
+	perf_pmu__for_each_hybrid_pmu(pmu) {
+		if (pmu_cmp(parse_state, pmu))
+			continue;
+
+		ret = create_event_hybrid(PERF_TYPE_HW_CACHE, idx, list,
+					  attr, name, config_terms, pmu);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/parse-events-hybrid.h b/tools/perf/util/parse-events-hybrid.h
new file mode 100644
index 000000000000..f33bd67aa851
--- /dev/null
+++ b/tools/perf/util/parse-events-hybrid.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_PARSE_EVENTS_HYBRID_H
+#define __PERF_PARSE_EVENTS_HYBRID_H
+
+#include <linux/list.h>
+#include <stdbool.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
+#include <string.h>
+
+int parse_events__add_numeric_hybrid(struct parse_events_state *parse_state,
+				     struct list_head *list,
+				     struct perf_event_attr *attr,
+				     char *name, struct list_head *config_terms,
+				     bool *hybrid);
+
+int parse_events__add_cache_hybrid(struct list_head *list, int *idx,
+				   struct perf_event_attr *attr, char *name,
+				   struct list_head *config_terms,
+				   bool *hybrid,
+				   struct parse_events_state *parse_state);
+
+#endif /* __PERF_PARSE_EVENTS_HYBRID_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index c0c0fab22cb8..4dad14265b81 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -37,6 +37,8 @@
 #include "util/evsel_config.h"
 #include "util/event.h"
 #include "util/pfm.h"
+#include "util/parse-events-hybrid.h"
+#include "util/pmu-hybrid.h"
 #include "perf.h"
 
 #define MAX_NAME_LEN 100
@@ -47,6 +49,9 @@ extern int parse_events_debug;
 int parse_events_parse(void *parse_state, void *scanner);
 static int get_config_terms(struct list_head *head_config,
 			    struct list_head *head_terms __maybe_unused);
+static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state,
+					 const char *str, char *pmu_name,
+					 struct list_head *list);
 
 static struct perf_pmu_event_symbol *perf_pmu_events_list;
 /*
@@ -452,14 +457,16 @@ static int config_attr(struct perf_event_attr *attr,
 int parse_events_add_cache(struct list_head *list, int *idx,
 			   char *type, char *op_result1, char *op_result2,
 			   struct parse_events_error *err,
-			   struct list_head *head_config)
+			   struct list_head *head_config,
+			   struct parse_events_state *parse_state)
 {
 	struct perf_event_attr attr;
 	LIST_HEAD(config_terms);
 	char name[MAX_NAME_LEN], *config_name;
 	int cache_type = -1, cache_op = -1, cache_result = -1;
 	char *op_result[2] = { op_result1, op_result2 };
-	int i, n;
+	int i, n, ret;
+	bool hybrid;
 
 	/*
 	 * No fallback - if we cannot get a clear cache type
@@ -519,6 +526,13 @@ int parse_events_add_cache(struct list_head *list, int *idx,
 		if (get_config_terms(head_config, &config_terms))
 			return -ENOMEM;
 	}
+
+	ret = parse_events__add_cache_hybrid(list, idx, &attr,
+					     config_name ? : name, &config_terms,
+					     &hybrid, parse_state);
+	if (hybrid)
+		return ret;
+
 	return add_event(list, idx, &attr, config_name ? : name, &config_terms);
 }
 
@@ -846,9 +860,9 @@ split_bpf_config_terms(struct list_head *evt_head_config,
 	struct parse_events_term *term, *temp;
 
 	/*
-	 * Currectly, all possible user config term
+	 * Currently, all possible user config term
 	 * belong to bpf object. parse_events__is_hardcoded_term()
-	 * happends to be a good flag.
+	 * happens to be a good flag.
 	 *
 	 * See parse_events_config_bpf() and
 	 * config_term_tracepoint().
@@ -898,7 +912,7 @@ int parse_events_load_bpf(struct parse_events_state *parse_state,
 
 	/*
 	 * Caller doesn't know anything about obj_head_config,
-	 * so combine them together again before returnning.
+	 * so combine them together again before returning.
 	 */
 	if (head_config)
 		list_splice_tail(&obj_head_config, head_config);
@@ -1185,10 +1199,10 @@ do {									   \
 	}
 
 	/*
-	 * Check term availbility after basic checking so
+	 * Check term availability after basic checking so
 	 * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered.
 	 *
-	 * If check availbility at the entry of this function,
+	 * If check availability at the entry of this function,
 	 * user will see "'<sysfs term>' is not usable in 'perf stat'"
 	 * if an invalid config term is provided for legacy events
 	 * (for example, instructions/badterm/...), which is confusing.
@@ -1419,6 +1433,8 @@ int parse_events_add_numeric(struct parse_events_state *parse_state,
 {
 	struct perf_event_attr attr;
 	LIST_HEAD(config_terms);
+	bool hybrid;
+	int ret;
 
 	memset(&attr, 0, sizeof(attr));
 	attr.type = type;
@@ -1433,6 +1449,12 @@ int parse_events_add_numeric(struct parse_events_state *parse_state,
 			return -ENOMEM;
 	}
 
+	ret = parse_events__add_numeric_hybrid(parse_state, list, &attr,
+					       get_config_name(head_config),
+					       &config_terms, &hybrid);
+	if (hybrid)
+		return ret;
+
 	return add_event(list, &parse_state->idx, &attr,
 			 get_config_name(head_config), &config_terms);
 }
@@ -1456,6 +1478,33 @@ static bool config_term_percore(struct list_head *config_terms)
 	return false;
 }
 
+static int parse_events__inside_hybrid_pmu(struct parse_events_state *parse_state,
+					   struct list_head *list, char *name,
+					   struct list_head *head_config)
+{
+	struct parse_events_term *term;
+	int ret = -1;
+
+	if (parse_state->fake_pmu || !head_config || list_empty(head_config) ||
+	    !perf_pmu__is_hybrid(name)) {
+		return -1;
+	}
+
+	/*
+	 * More than one term in list.
+	 */
+	if (head_config->next && head_config->next->next != head_config)
+		return -1;
+
+	term = list_first_entry(head_config, struct parse_events_term, list);
+	if (term && term->config && strcmp(term->config, "event")) {
+		ret = parse_events__with_hybrid_pmu(parse_state, term->config,
+						    name, list);
+	}
+
+	return ret;
+}
+
 int parse_events_add_pmu(struct parse_events_state *parse_state,
 			 struct list_head *list, char *name,
 			 struct list_head *head_config,
@@ -1549,6 +1598,11 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
 	if (pmu->default_config && get_config_chgs(pmu, head_config, &config_terms))
 		return -ENOMEM;
 
+	if (!parse_events__inside_hybrid_pmu(parse_state, list, name,
+					     head_config)) {
+		return 0;
+	}
+
 	if (!parse_state->fake_pmu && perf_pmu__config(pmu, &attr, head_config, parse_state->error)) {
 		struct evsel_config_term *pos, *tmp;
 
@@ -1567,6 +1621,9 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
 	if (!evsel)
 		return -ENOMEM;
 
+	if (evsel->name)
+		evsel->use_config_name = true;
+
 	evsel->pmu_name = name ? strdup(name) : NULL;
 	evsel->use_uncore_alias = use_uncore_alias;
 	evsel->percore = config_term_percore(&evsel->config_terms);
@@ -1804,6 +1861,7 @@ struct event_modifier {
 	int pinned;
 	int weak;
 	int exclusive;
+	int bpf_counter;
 };
 
 static int get_event_modifier(struct event_modifier *mod, char *str,
@@ -1824,6 +1882,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
 	int exclude = eu | ek | eh;
 	int exclude_GH = evsel ? evsel->exclude_GH : 0;
 	int weak = 0;
+	int bpf_counter = 0;
 
 	memset(mod, 0, sizeof(*mod));
 
@@ -1867,6 +1926,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
 			exclusive = 1;
 		} else if (*str == 'W') {
 			weak = 1;
+		} else if (*str == 'b') {
+			bpf_counter = 1;
 		} else
 			break;
 
@@ -1898,6 +1959,7 @@ static int get_event_modifier(struct event_modifier *mod, char *str,
 	mod->sample_read = sample_read;
 	mod->pinned = pinned;
 	mod->weak = weak;
+	mod->bpf_counter = bpf_counter;
 	mod->exclusive = exclusive;
 
 	return 0;
@@ -1912,7 +1974,7 @@ static int check_modifier(char *str)
 	char *p = str;
 
 	/* The sizeof includes 0 byte as well. */
-	if (strlen(str) > (sizeof("ukhGHpppPSDIWe") - 1))
+	if (strlen(str) > (sizeof("ukhGHpppPSDIWeb") - 1))
 		return -1;
 
 	while (*p) {
@@ -1953,6 +2015,7 @@ int parse_events__modifier_event(struct list_head *list, char *str, bool add)
 		evsel->sample_read         = mod.sample_read;
 		evsel->precise_max         = mod.precise_max;
 		evsel->weak_group	   = mod.weak;
+		evsel->bpf_counter	   = mod.bpf_counter;
 
 		if (evsel__is_group_leader(evsel)) {
 			evsel->core.attr.pinned = mod.pinned;
@@ -2162,6 +2225,33 @@ int parse_events_terms(struct list_head *terms, const char *str)
 	return ret;
 }
 
+static int parse_events__with_hybrid_pmu(struct parse_events_state *parse_state,
+					 const char *str, char *pmu_name,
+					 struct list_head *list)
+{
+	struct parse_events_state ps = {
+		.list            = LIST_HEAD_INIT(ps.list),
+		.stoken          = PE_START_EVENTS,
+		.hybrid_pmu_name = pmu_name,
+		.idx             = parse_state->idx,
+	};
+	int ret;
+
+	ret = parse_events__scanner(str, &ps);
+	perf_pmu__parse_cleanup();
+
+	if (!ret) {
+		if (!list_empty(&ps.list)) {
+			list_splice(&ps.list, list);
+			parse_state->idx = ps.idx;
+			return 0;
+		} else
+			return -1;
+	}
+
+	return ret;
+}
+
 int __parse_events(struct evlist *evlist, const char *str,
 		   struct parse_events_error *err, struct perf_pmu *fake_pmu)
 {
@@ -3185,3 +3275,12 @@ char *parse_events_formats_error_string(char *additional_terms)
 fail:
 	return NULL;
 }
+
+struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx,
+					     struct perf_event_attr *attr,
+					     char *name, struct perf_pmu *pmu,
+					     struct list_head *config_terms)
+{
+	return __add_event(list, idx, attr, true, name, pmu,
+			   config_terms, false, NULL);
+}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index e80c9b74f2f2..bf6e41aa9b6a 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -138,6 +138,7 @@ struct parse_events_state {
 	struct list_head	  *terms;
 	int			   stoken;
 	struct perf_pmu		  *fake_pmu;
+	char			  *hybrid_pmu_name;
 };
 
 void parse_events__handle_error(struct parse_events_error *err, int idx,
@@ -188,7 +189,8 @@ int parse_events_add_tool(struct parse_events_state *parse_state,
 int parse_events_add_cache(struct list_head *list, int *idx,
 			   char *type, char *op_result1, char *op_result2,
 			   struct parse_events_error *error,
-			   struct list_head *head_config);
+			   struct list_head *head_config,
+			   struct parse_events_state *parse_state);
 int parse_events_add_breakpoint(struct list_head *list, int *idx,
 				u64 addr, char *type, u64 len);
 int parse_events_add_pmu(struct parse_events_state *parse_state,
@@ -263,4 +265,9 @@ static inline bool is_sdt_event(char *str __maybe_unused)
 
 int perf_pmu__test_parse_init(void);
 
+struct evsel *parse_events__add_event_hybrid(struct list_head *list, int *idx,
+					     struct perf_event_attr *attr,
+					     char *name, struct perf_pmu *pmu,
+					     struct list_head *config_terms);
+
 #endif /* __PERF_PARSE_EVENTS_H */
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 0b36285a9435..fb8646cc3e83 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -210,7 +210,7 @@ name_tag	[\'][a-zA-Z_*?\[\]][a-zA-Z0-9_*?\-,\.\[\]:=]*[\']
 name_minus	[a-zA-Z_*?][a-zA-Z0-9\-_*?.:]*
 drv_cfg_term	[a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)?
 /* If you add a modifier you need to update check_modifier() */
-modifier_event	[ukhpPGHSDIWe]+
+modifier_event	[ukhpPGHSDIWeb]+
 modifier_bp	[rwx]{1,3}
 
 %%
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index d57ac86ce7ca..aba12a4d488e 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -454,7 +454,8 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_e
 
 	list = alloc_list();
 	ABORT_ON(!list);
-	err = parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6);
+	err = parse_events_add_cache(list, &parse_state->idx, $1, $3, $5, error, $6,
+				     parse_state);
 	parse_events_terms__delete($6);
 	free($1);
 	free($3);
@@ -475,7 +476,8 @@ PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config
 
 	list = alloc_list();
 	ABORT_ON(!list);
-	err = parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4);
+	err = parse_events_add_cache(list, &parse_state->idx, $1, $3, NULL, error, $4,
+				     parse_state);
 	parse_events_terms__delete($4);
 	free($1);
 	free($3);
@@ -495,7 +497,8 @@ PE_NAME_CACHE_TYPE opt_event_config
 
 	list = alloc_list();
 	ABORT_ON(!list);
-	err = parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2);
+	err = parse_events_add_cache(list, &parse_state->idx, $1, NULL, NULL, error, $2,
+				     parse_state);
 	parse_events_terms__delete($2);
 	free($1);
 	if (err) {
diff --git a/tools/perf/util/pmu-hybrid.c b/tools/perf/util/pmu-hybrid.c
new file mode 100644
index 000000000000..f51ccaac60ee
--- /dev/null
+++ b/tools/perf/util/pmu-hybrid.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/list.h>
+#include <linux/compiler.h>
+#include <linux/string.h>
+#include <linux/zalloc.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <locale.h>
+#include <api/fs/fs.h>
+#include "fncache.h"
+#include "pmu-hybrid.h"
+
+LIST_HEAD(perf_pmu__hybrid_pmus);
+
+bool perf_pmu__hybrid_mounted(const char *name)
+{
+	char path[PATH_MAX];
+	const char *sysfs;
+	FILE *file;
+	int n, cpu;
+
+	if (strncmp(name, "cpu_", 4))
+		return false;
+
+	sysfs = sysfs__mountpoint();
+	if (!sysfs)
+		return false;
+
+	snprintf(path, PATH_MAX, CPUS_TEMPLATE_CPU, sysfs, name);
+	if (!file_available(path))
+		return false;
+
+	file = fopen(path, "r");
+	if (!file)
+		return false;
+
+	n = fscanf(file, "%u", &cpu);
+	fclose(file);
+	if (n <= 0)
+		return false;
+
+	return true;
+}
+
+struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name)
+{
+	struct perf_pmu *pmu;
+
+	if (!name)
+		return NULL;
+
+	perf_pmu__for_each_hybrid_pmu(pmu) {
+		if (!strcmp(name, pmu->name))
+			return pmu;
+	}
+
+	return NULL;
+}
+
+bool perf_pmu__is_hybrid(const char *name)
+{
+	return perf_pmu__find_hybrid_pmu(name) != NULL;
+}
+
+char *perf_pmu__hybrid_type_to_pmu(const char *type)
+{
+	char *pmu_name = NULL;
+
+	if (asprintf(&pmu_name, "cpu_%s", type) < 0)
+		return NULL;
+
+	if (perf_pmu__is_hybrid(pmu_name))
+		return pmu_name;
+
+	/*
+	 * pmu may be not scanned, check the sysfs.
+	 */
+	if (perf_pmu__hybrid_mounted(pmu_name))
+		return pmu_name;
+
+	free(pmu_name);
+	return NULL;
+}
diff --git a/tools/perf/util/pmu-hybrid.h b/tools/perf/util/pmu-hybrid.h
new file mode 100644
index 000000000000..d0fa7bc50a76
--- /dev/null
+++ b/tools/perf/util/pmu-hybrid.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PMU_HYBRID_H
+#define __PMU_HYBRID_H
+
+#include <linux/perf_event.h>
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <stdbool.h>
+#include "pmu.h"
+
+extern struct list_head perf_pmu__hybrid_pmus;
+
+#define perf_pmu__for_each_hybrid_pmu(pmu)	\
+	list_for_each_entry(pmu, &perf_pmu__hybrid_pmus, hybrid_list)
+
+bool perf_pmu__hybrid_mounted(const char *name);
+
+struct perf_pmu *perf_pmu__find_hybrid_pmu(const char *name);
+bool perf_pmu__is_hybrid(const char *name);
+char *perf_pmu__hybrid_type_to_pmu(const char *type);
+
+#endif /* __PMU_HYBRID_H */
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 46fd0f998484..88c8ecdc60b0 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -25,6 +25,7 @@
 #include "string2.h"
 #include "strbuf.h"
 #include "fncache.h"
+#include "pmu-hybrid.h"
 
 struct perf_pmu perf_pmu__fake;
 
@@ -39,6 +40,7 @@ int perf_pmu_parse(struct list_head *list, char *name);
 extern FILE *perf_pmu_in;
 
 static LIST_HEAD(pmus);
+static bool hybrid_scanned;
 
 /*
  * Parse & process all the sysfs attributes located under
@@ -283,6 +285,7 @@ void perf_pmu_free_alias(struct perf_pmu_alias *newalias)
 	zfree(&newalias->str);
 	zfree(&newalias->metric_expr);
 	zfree(&newalias->metric_name);
+	zfree(&newalias->pmu_name);
 	parse_events_terms__purge(&newalias->terms);
 	free(newalias);
 }
@@ -297,6 +300,10 @@ static bool perf_pmu_merge_alias(struct perf_pmu_alias *newalias,
 
 	list_for_each_entry(a, alist, list) {
 		if (!strcasecmp(newalias->name, a->name)) {
+			if (newalias->pmu_name && a->pmu_name &&
+			    !strcasecmp(newalias->pmu_name, a->pmu_name)) {
+				continue;
+			}
 			perf_pmu_update_alias(a, newalias);
 			perf_pmu_free_alias(newalias);
 			return true;
@@ -306,18 +313,27 @@ static bool perf_pmu_merge_alias(struct perf_pmu_alias *newalias,
 }
 
 static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
-				 char *desc, char *val,
-				 char *long_desc, char *topic,
-				 char *unit, char *perpkg,
-				 char *metric_expr,
-				 char *metric_name,
-				 char *deprecated)
+				 char *desc, char *val, struct pmu_event *pe)
 {
 	struct parse_events_term *term;
 	struct perf_pmu_alias *alias;
 	int ret;
 	int num;
 	char newval[256];
+	char *long_desc = NULL, *topic = NULL, *unit = NULL, *perpkg = NULL,
+	     *metric_expr = NULL, *metric_name = NULL, *deprecated = NULL,
+	     *pmu_name = NULL;
+
+	if (pe) {
+		long_desc = (char *)pe->long_desc;
+		topic = (char *)pe->topic;
+		unit = (char *)pe->unit;
+		perpkg = (char *)pe->perpkg;
+		metric_expr = (char *)pe->metric_expr;
+		metric_name = (char *)pe->metric_name;
+		deprecated = (char *)pe->deprecated;
+		pmu_name = (char *)pe->pmu;
+	}
 
 	alias = malloc(sizeof(*alias));
 	if (!alias)
@@ -382,6 +398,7 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
 	}
 	alias->per_pkg = perpkg && sscanf(perpkg, "%d", &num) == 1 && num == 1;
 	alias->str = strdup(newval);
+	alias->pmu_name = pmu_name ? strdup(pmu_name) : NULL;
 
 	if (deprecated)
 		alias->deprecated = true;
@@ -406,8 +423,7 @@ static int perf_pmu__new_alias(struct list_head *list, char *dir, char *name, FI
 	/* Remove trailing newline from sysfs file */
 	strim(buf);
 
-	return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL, NULL, NULL,
-				     NULL, NULL, NULL, NULL);
+	return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL);
 }
 
 static inline bool pmu_alias_info_file(char *name)
@@ -599,7 +615,6 @@ static struct perf_cpu_map *__pmu_cpumask(const char *path)
  */
 #define SYS_TEMPLATE_ID	"./bus/event_source/devices/%s/identifier"
 #define CPUS_TEMPLATE_UNCORE	"%s/bus/event_source/devices/%s/cpumask"
-#define CPUS_TEMPLATE_CPU	"%s/bus/event_source/devices/%s/cpus"
 
 static struct perf_cpu_map *pmu_cpumask(const char *name)
 {
@@ -631,6 +646,9 @@ static bool pmu_is_uncore(const char *name)
 	char path[PATH_MAX];
 	const char *sysfs;
 
+	if (perf_pmu__hybrid_mounted(name))
+		return false;
+
 	sysfs = sysfs__mountpoint();
 	snprintf(path, PATH_MAX, CPUS_TEMPLATE_UNCORE, sysfs, name);
 	return file_available(path);
@@ -717,6 +735,11 @@ struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu)
 	return map;
 }
 
+struct pmu_events_map *__weak pmu_events_map__find(void)
+{
+	return perf_pmu__find_map(NULL);
+}
+
 bool pmu_uncore_alias_match(const char *pmu_name, const char *name)
 {
 	char *tmp = NULL, *tok, *str;
@@ -793,11 +816,7 @@ new_alias:
 		/* need type casts to override 'const' */
 		__perf_pmu__new_alias(head, NULL, (char *)pe->name,
 				(char *)pe->desc, (char *)pe->event,
-				(char *)pe->long_desc, (char *)pe->topic,
-				(char *)pe->unit, (char *)pe->perpkg,
-				(char *)pe->metric_expr,
-				(char *)pe->metric_name,
-				(char *)pe->deprecated);
+				pe);
 	}
 }
 
@@ -864,13 +883,7 @@ static int pmu_add_sys_aliases_iter_fn(struct pmu_event *pe, void *data)
 				      (char *)pe->name,
 				      (char *)pe->desc,
 				      (char *)pe->event,
-				      (char *)pe->long_desc,
-				      (char *)pe->topic,
-				      (char *)pe->unit,
-				      (char *)pe->perpkg,
-				      (char *)pe->metric_expr,
-				      (char *)pe->metric_name,
-				      (char *)pe->deprecated);
+				      pe);
 	}
 
 	return 0;
@@ -942,6 +955,7 @@ static struct perf_pmu *pmu_lookup(const char *name)
 	pmu->is_uncore = pmu_is_uncore(name);
 	if (pmu->is_uncore)
 		pmu->id = pmu_id(name);
+	pmu->is_hybrid = perf_pmu__hybrid_mounted(name);
 	pmu->max_precise = pmu_max_precise(name);
 	pmu_add_cpu_aliases(&aliases, pmu);
 	pmu_add_sys_aliases(&aliases, pmu);
@@ -953,6 +967,9 @@ static struct perf_pmu *pmu_lookup(const char *name)
 	list_splice(&aliases, &pmu->aliases);
 	list_add_tail(&pmu->list, &pmus);
 
+	if (pmu->is_hybrid)
+		list_add_tail(&pmu->hybrid_list, &perf_pmu__hybrid_pmus);
+
 	pmu->default_config = perf_pmu__get_default_config(pmu);
 
 	return pmu;
@@ -1069,7 +1086,7 @@ int perf_pmu__format_type(struct list_head *formats, const char *name)
 
 /*
  * Sets value based on the format definition (format parameter)
- * and unformated value (value parameter).
+ * and unformatted value (value parameter).
  */
 static void pmu_format_value(unsigned long *format, __u64 value, __u64 *v,
 			     bool zero)
@@ -1408,7 +1425,7 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
 	}
 
 	/*
-	 * if no unit or scale foundin aliases, then
+	 * if no unit or scale found in aliases, then
 	 * set defaults as for evsel
 	 * unit cannot left to NULL
 	 */
@@ -1845,3 +1862,13 @@ void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
 		   "'%llx' not supported by kernel)!\n",
 		   name ?: "N/A", buf, config);
 }
+
+bool perf_pmu__has_hybrid(void)
+{
+	if (!hybrid_scanned) {
+		hybrid_scanned = true;
+		perf_pmu__scan(NULL);
+	}
+
+	return !list_empty(&perf_pmu__hybrid_pmus);
+}
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 160b0f561771..a790ef758171 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -5,6 +5,7 @@
 #include <linux/bitmap.h>
 #include <linux/compiler.h>
 #include <linux/perf_event.h>
+#include <linux/list.h>
 #include <stdbool.h>
 #include "parse-events.h"
 #include "pmu-events/pmu-events.h"
@@ -19,6 +20,7 @@ enum {
 
 #define PERF_PMU_FORMAT_BITS 64
 #define EVENT_SOURCE_DEVICE_PATH "/bus/event_source/devices/"
+#define CPUS_TEMPLATE_CPU	"%s/bus/event_source/devices/%s/cpus"
 
 struct perf_event_attr;
 
@@ -34,6 +36,7 @@ struct perf_pmu {
 	__u32 type;
 	bool selectable;
 	bool is_uncore;
+	bool is_hybrid;
 	bool auxtrace;
 	int max_precise;
 	struct perf_event_attr *default_config;
@@ -42,6 +45,7 @@ struct perf_pmu {
 	struct list_head aliases; /* HEAD struct perf_pmu_alias -> list */
 	struct list_head caps;    /* HEAD struct perf_pmu_caps -> list */
 	struct list_head list;    /* ELEM */
+	struct list_head hybrid_list;
 };
 
 extern struct perf_pmu perf_pmu__fake;
@@ -72,6 +76,7 @@ struct perf_pmu_alias {
 	bool deprecated;
 	char *metric_expr;
 	char *metric_name;
+	char *pmu_name;
 };
 
 struct perf_pmu *perf_pmu__find(const char *name);
@@ -114,6 +119,7 @@ void pmu_add_cpu_aliases_map(struct list_head *head, struct perf_pmu *pmu,
 			     struct pmu_events_map *map);
 
 struct pmu_events_map *perf_pmu__find_map(struct perf_pmu *pmu);
+struct pmu_events_map *pmu_events_map__find(void);
 bool pmu_uncore_alias_match(const char *pmu_name, const char *name);
 void perf_pmu_free_alias(struct perf_pmu_alias *alias);
 
@@ -126,4 +132,6 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu);
 void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config,
 				   char *name);
 
+bool perf_pmu__has_hybrid(void);
+
 #endif /* __PMU_H */
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index a9cff3a50ddf..a78c8d59a555 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3228,7 +3228,7 @@ errout:
 	return err;
 }
 
-/* Concatinate two arrays */
+/* Concatenate two arrays */
 static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b)
 {
 	void *ret;
@@ -3258,7 +3258,7 @@ concat_probe_trace_events(struct probe_trace_event **tevs, int *ntevs,
 	if (*ntevs + ntevs2 > probe_conf.max_probes)
 		ret = -E2BIG;
 	else {
-		/* Concatinate the array of probe_trace_event */
+		/* Concatenate the array of probe_trace_event */
 		new_tevs = memcat(*tevs, (*ntevs) * sizeof(**tevs),
 				  *tevs2, ntevs2 * sizeof(**tevs2));
 		if (!new_tevs)
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 1b118c9c86a6..866f2d514d72 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -164,7 +164,7 @@ static struct probe_trace_arg_ref *alloc_trace_arg_ref(long offs)
 /*
  * Convert a location into trace_arg.
  * If tvar == NULL, this just checks variable can be converted.
- * If fentry == true and vr_die is a parameter, do huristic search
+ * If fentry == true and vr_die is a parameter, do heuristic search
  * for the location fuzzed by function entry mcount.
  */
 static int convert_variable_location(Dwarf_Die *vr_die, Dwarf_Addr addr,
@@ -498,7 +498,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			       " nor array.\n", varname);
 			return -EINVAL;
 		}
-		/* While prcessing unnamed field, we don't care about this */
+		/* While processing unnamed field, we don't care about this */
 		if (field->ref && dwarf_diename(vr_die)) {
 			pr_err("Semantic error: %s must be referred by '.'\n",
 			       field->name);
@@ -1832,7 +1832,7 @@ static int line_range_walk_cb(const char *fname, int lineno,
 	    (lf->lno_s > lineno || lf->lno_e < lineno))
 		return 0;
 
-	/* Make sure this line can be reversable */
+	/* Make sure this line can be reversible */
 	if (cu_find_lineinfo(&lf->cu_die, addr, &__fname, &__lineno) > 0
 	    && (lineno != __lineno || strcmp(fname, __fname)))
 		return 0;
diff --git a/tools/perf/util/python-ext-sources b/tools/perf/util/python-ext-sources
index 845dd46e3c61..d7c976671e3a 100644
--- a/tools/perf/util/python-ext-sources
+++ b/tools/perf/util/python-ext-sources
@@ -37,3 +37,5 @@ util/units.c
 util/affinity.c
 util/rwsem.c
 util/hashmap.c
+util/pmu-hybrid.c
+util/fncache.c
diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c
index 278abecb5bdf..412f8e79e409 100644
--- a/tools/perf/util/python.c
+++ b/tools/perf/util/python.c
@@ -90,6 +90,7 @@ int metricgroup__copy_metric_events(struct evlist *evlist, struct cgroup *cgrp,
  */
 void bpf_counter__destroy(struct evsel *evsel);
 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd);
+int bpf_counter__disable(struct evsel *evsel);
 
 void bpf_counter__destroy(struct evsel *evsel __maybe_unused)
 {
@@ -100,6 +101,11 @@ int bpf_counter__install_pe(struct evsel *evsel __maybe_unused, int cpu __maybe_
 	return 0;
 }
 
+int bpf_counter__disable(struct evsel *evsel __maybe_unused)
+{
+	return 0;
+}
+
 /*
  * Support debug printing even though util/debug.c is not linked.  That means
  * implementing 'verbose' and 'eprintf'.
diff --git a/tools/perf/util/s390-cpumsf.c b/tools/perf/util/s390-cpumsf.c
index 078a71773565..8130b56aa04b 100644
--- a/tools/perf/util/s390-cpumsf.c
+++ b/tools/perf/util/s390-cpumsf.c
@@ -45,7 +45,7 @@
  * the data portion is mmap()'ed.
  *
  * To sort the queues in chronological order, all queue access is controlled
- * by the auxtrace_heap. This is basicly a stack, each stack element has two
+ * by the auxtrace_heap. This is basically a stack, each stack element has two
  * entries, the queue number and a time stamp. However the stack is sorted by
  * the time stamps. The highest time stamp is at the bottom the lowest
  * (nearest) time stamp is at the top. That sort order is maintained at all
@@ -65,11 +65,11 @@
  * stamp of the last processed entry of the auxtrace_buffer replaces the
  * current auxtrace_heap top.
  *
- * 3. Auxtrace_queues might run of out data and are feeded by the
+ * 3. Auxtrace_queues might run of out data and are fed by the
  * PERF_RECORD_AUXTRACE handling, see s390_cpumsf_process_auxtrace_event().
  *
  * Event Generation
- * Each sampling-data entry in the auxilary trace data generates a perf sample.
+ * Each sampling-data entry in the auxiliary trace data generates a perf sample.
  * This sample is filled
  * with data from the auxtrace such as PID/TID, instruction address, CPU state,
  * etc. This sample is processed with perf_session__deliver_synth_event() to
@@ -575,7 +575,7 @@ static unsigned long long get_trailer_time(const unsigned char *buf)
  * pointer to the queue, the second parameter is the time stamp. This
  * is the time stamp:
  * - of the event that triggered this processing.
- * - or the time stamp when the last proccesing of this queue stopped.
+ * - or the time stamp when the last processing of this queue stopped.
  *   In this case it stopped at a 4KB page boundary and record the
  *   position on where to continue processing on the next invocation
  *   (see buffer->use_data and buffer->use_size).
@@ -640,7 +640,7 @@ static int s390_cpumsf_samples(struct s390_cpumsf_queue *sfq, u64 *ts)
 			goto out;
 		}
 
-		pos += dsdes;	/* Skip diagnositic entry */
+		pos += dsdes;	/* Skip diagnostic entry */
 
 		/* Check for trailer entry */
 		if (!s390_cpumsf_reached_trailer(bsdes + dsdes, pos)) {
diff --git a/tools/perf/util/s390-sample-raw.c b/tools/perf/util/s390-sample-raw.c
index cfcf8d534d76..08ec3c3ae0ee 100644
--- a/tools/perf/util/s390-sample-raw.c
+++ b/tools/perf/util/s390-sample-raw.c
@@ -160,11 +160,9 @@ static void s390_cpumcfdg_dump(struct perf_sample *sample)
 	const char *color = PERF_COLOR_BLUE;
 	struct cf_ctrset_entry *cep, ce;
 	struct pmu_events_map *map;
-	struct perf_pmu pmu;
 	u64 *p;
 
-	memset(&pmu, 0, sizeof(pmu));
-	map = perf_pmu__find_map(&pmu);
+	map = pmu_events_map__find();
 	while (offset < len) {
 		cep = (struct cf_ctrset_entry *)(buf + offset);
 
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index c83c2c6564e0..4e4aa4c97ac5 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1531,7 +1531,7 @@ static void set_table_handlers(struct tables *tables)
 		 * Attempt to use the call path root from the call return
 		 * processor, if the call return processor is in use. Otherwise,
 		 * we allocate a new call path root. This prevents exporting
-		 * duplicate call path ids when both are in use simultaniously.
+		 * duplicate call path ids when both are in use simultaneously.
 		 */
 		if (tables->dbe.crp)
 			tables->dbe.cpr = tables->dbe.crp->cpr;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 859832a82496..a12cf4f0e97a 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -29,6 +29,7 @@
 #include "thread-stack.h"
 #include "sample-raw.h"
 #include "stat.h"
+#include "tsc.h"
 #include "ui/progress.h"
 #include "../perf.h"
 #include "arch/common.h"
@@ -451,6 +452,16 @@ static int process_stat_round_stub(struct perf_session *perf_session __maybe_unu
 	return 0;
 }
 
+static int process_event_time_conv_stub(struct perf_session *perf_session __maybe_unused,
+					union perf_event *event)
+{
+	if (dump_trace)
+		perf_event__fprintf_time_conv(event, stdout);
+
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
 static int perf_session__process_compressed_event_stub(struct perf_session *session __maybe_unused,
 						       union perf_event *event __maybe_unused,
 						       u64 file_offset __maybe_unused)
@@ -532,7 +543,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool)
 	if (tool->stat_round == NULL)
 		tool->stat_round = process_stat_round_stub;
 	if (tool->time_conv == NULL)
-		tool->time_conv = process_event_op2_stub;
+		tool->time_conv = process_event_time_conv_stub;
 	if (tool->feature == NULL)
 		tool->feature = process_event_op2_stub;
 	if (tool->compressed == NULL)
@@ -949,6 +960,19 @@ static void perf_event__stat_round_swap(union perf_event *event,
 	event->stat_round.time = bswap_64(event->stat_round.time);
 }
 
+static void perf_event__time_conv_swap(union perf_event *event,
+				       bool sample_id_all __maybe_unused)
+{
+	event->time_conv.time_shift = bswap_64(event->time_conv.time_shift);
+	event->time_conv.time_mult  = bswap_64(event->time_conv.time_mult);
+	event->time_conv.time_zero  = bswap_64(event->time_conv.time_zero);
+
+	if (event_contains(event->time_conv, time_cycles)) {
+		event->time_conv.time_cycles = bswap_64(event->time_conv.time_cycles);
+		event->time_conv.time_mask = bswap_64(event->time_conv.time_mask);
+	}
+}
+
 typedef void (*perf_event__swap_op)(union perf_event *event,
 				    bool sample_id_all);
 
@@ -985,7 +1009,7 @@ static perf_event__swap_op perf_event__swap_ops[] = {
 	[PERF_RECORD_STAT]		  = perf_event__stat_swap,
 	[PERF_RECORD_STAT_ROUND]	  = perf_event__stat_round_swap,
 	[PERF_RECORD_EVENT_UPDATE]	  = perf_event__event_update_swap,
-	[PERF_RECORD_TIME_CONV]		  = perf_event__all64_swap,
+	[PERF_RECORD_TIME_CONV]		  = perf_event__time_conv_swap,
 	[PERF_RECORD_HEADER_MAX]	  = NULL,
 };
 
@@ -1069,7 +1093,7 @@ static void callchain__lbr_callstack_printf(struct perf_sample *sample)
 		 * in "to" register.
 		 * For example, there is a call stack
 		 * "A"->"B"->"C"->"D".
-		 * The LBR registers will recorde like
+		 * The LBR registers will be recorded like
 		 * "C"->"D", "B"->"C", "A"->"B".
 		 * So only the first "to" register and all "from"
 		 * registers are needed to construct the whole stack.
@@ -1302,8 +1326,10 @@ static void dump_sample(struct evsel *evsel, union perf_event *event,
 
 	if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
 		printf("... weight: %" PRIu64 "", sample->weight);
-			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
+			if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
 				printf(",0x%"PRIx16"", sample->ins_lat);
+				printf(",0x%"PRIx16"", sample->p_stage_cyc);
+			}
 		printf("\n");
 	}
 
@@ -1584,7 +1610,7 @@ static s64 perf_session__process_user_event(struct perf_session *session,
 		return tool->event_update(tool, event, &session->evlist);
 	case PERF_RECORD_HEADER_EVENT_TYPE:
 		/*
-		 * Depreceated, but we need to handle it for sake
+		 * Deprecated, but we need to handle it for sake
 		 * of old data files create in pipe mode.
 		 */
 		return 0;
@@ -2350,7 +2376,8 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp
 	return machines__fprintf_dsos_buildid(&session->machines, fp, skip, parm);
 }
 
-size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
+size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp,
+				       bool skip_empty)
 {
 	size_t ret;
 	const char *msg = "";
@@ -2360,7 +2387,7 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
 
 	ret = fprintf(fp, "\nAggregated stats:%s\n", msg);
 
-	ret += events_stats__fprintf(&session->evlist->stats, fp);
+	ret += events_stats__fprintf(&session->evlist->stats, fp, skip_empty);
 	return ret;
 }
 
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index f76480166d38..e31ba4c92a6c 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -113,7 +113,8 @@ size_t perf_session__fprintf_dsos(struct perf_session *session, FILE *fp);
 size_t perf_session__fprintf_dsos_buildid(struct perf_session *session, FILE *fp,
 					  bool (fn)(struct dso *dso, int parm), int parm);
 
-size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp);
+size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp,
+				       bool skip_empty);
 
 struct evsel *perf_session__find_first_evtype(struct perf_session *session,
 					    unsigned int type);
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 552b590485bf..88ce47f2547e 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -25,6 +25,7 @@
 #include <traceevent/event-parse.h>
 #include "mem-events.h"
 #include "annotate.h"
+#include "event.h"
 #include "time-utils.h"
 #include "cgroup.h"
 #include "machine.h"
@@ -36,7 +37,7 @@ const char	default_parent_pattern[] = "^sys_|^do_page_fault";
 const char	*parent_pattern = default_parent_pattern;
 const char	*default_sort_order = "comm,dso,symbol";
 const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat";
+const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,p_stage_cyc";
 const char	default_top_sort_order[] = "dso,symbol";
 const char	default_diff_sort_order[] = "dso,symbol";
 const char	default_tracepoint_sort_order[] = "trace";
@@ -45,6 +46,8 @@ const char	*field_order;
 regex_t		ignore_callees_regex;
 int		have_ignore_callees = 0;
 enum sort_mode	sort__mode = SORT_MODE__NORMAL;
+const char	*dynamic_headers[] = {"local_ins_lat", "p_stage_cyc"};
+const char	*arch_specific_sort_keys[] = {"p_stage_cyc"};
 
 /*
  * Replaces all occurrences of a char used with the:
@@ -1408,6 +1411,25 @@ struct sort_entry sort_global_ins_lat = {
 	.se_width_idx	= HISTC_GLOBAL_INS_LAT,
 };
 
+static int64_t
+sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return left->stat.p_stage_cyc - right->stat.p_stage_cyc;
+}
+
+static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
+					size_t size, unsigned int width)
+{
+	return repsep_snprintf(bf, size, "%-*u", width, he->stat.p_stage_cyc);
+}
+
+struct sort_entry sort_p_stage_cyc = {
+	.se_header      = "Pipeline Stage Cycle",
+	.se_cmp         = sort__global_p_stage_cyc_cmp,
+	.se_snprintf	= hist_entry__p_stage_cyc_snprintf,
+	.se_width_idx	= HISTC_P_STAGE_CYC,
+};
+
 struct sort_entry sort_mem_daddr_sym = {
 	.se_header	= "Data Symbol",
 	.se_cmp		= sort__daddr_cmp,
@@ -1816,6 +1838,21 @@ struct sort_dimension {
 	int			taken;
 };
 
+int __weak arch_support_sort_key(const char *sort_key __maybe_unused)
+{
+	return 0;
+}
+
+const char * __weak arch_perf_header_entry(const char *se_header)
+{
+	return se_header;
+}
+
+static void sort_dimension_add_dynamic_header(struct sort_dimension *sd)
+{
+	sd->entry->se_header = arch_perf_header_entry(sd->entry->se_header);
+}
+
 #define DIM(d, n, func) [d] = { .name = n, .entry = &(func) }
 
 static struct sort_dimension common_sort_dimensions[] = {
@@ -1841,6 +1878,7 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_CODE_PAGE_SIZE, "code_page_size", sort_code_page_size),
 	DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat),
 	DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat),
+	DIM(SORT_PIPELINE_STAGE_CYC, "p_stage_cyc", sort_p_stage_cyc),
 };
 
 #undef DIM
@@ -2739,7 +2777,20 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 			struct evlist *evlist,
 			int level)
 {
-	unsigned int i;
+	unsigned int i, j;
+
+	/*
+	 * Check to see if there are any arch specific
+	 * sort dimensions not applicable for the current
+	 * architecture. If so, Skip that sort key since
+	 * we don't want to display it in the output fields.
+	 */
+	for (j = 0; j < ARRAY_SIZE(arch_specific_sort_keys); j++) {
+		if (!strcmp(arch_specific_sort_keys[j], tok) &&
+				!arch_support_sort_key(tok)) {
+			return 0;
+		}
+	}
 
 	for (i = 0; i < ARRAY_SIZE(common_sort_dimensions); i++) {
 		struct sort_dimension *sd = &common_sort_dimensions[i];
@@ -2747,6 +2798,11 @@ int sort_dimension__add(struct perf_hpp_list *list, const char *tok,
 		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
+		for (j = 0; j < ARRAY_SIZE(dynamic_headers); j++) {
+			if (!strcmp(dynamic_headers[j], sd->name))
+				sort_dimension_add_dynamic_header(sd);
+		}
+
 		if (sd->entry == &sort_parent) {
 			int ret = regcomp(&parent_regex, parent_pattern, REG_EXTENDED);
 			if (ret) {
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 63f67a3f3630..87a092645aa7 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -51,6 +51,7 @@ struct he_stat {
 	u64			period_guest_us;
 	u64			weight;
 	u64			ins_lat;
+	u64			p_stage_cyc;
 	u32			nr_events;
 };
 
@@ -234,6 +235,7 @@ enum sort_type {
 	SORT_CODE_PAGE_SIZE,
 	SORT_LOCAL_INS_LAT,
 	SORT_GLOBAL_INS_LAT,
+	SORT_PIPELINE_STAGE_CYC,
 
 	/* branch stack specific sort keys */
 	__SORT_BRANCH_STACK,
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index 7f09cdaf5b60..a76fff5e7d83 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -17,6 +17,8 @@
 #include "cgroup.h"
 #include <api/fs/fs.h>
 #include "util.h"
+#include "iostat.h"
+#include "pmu-hybrid.h"
 
 #define CNTR_NOT_SUPPORTED	"<not supported>"
 #define CNTR_NOT_COUNTED	"<not counted>"
@@ -310,6 +312,11 @@ static void print_metric_header(struct perf_stat_config *config,
 	struct outstate *os = ctx;
 	char tbuf[1024];
 
+	/* In case of iostat, print metric header for first root port only */
+	if (config->iostat_run &&
+	    os->evsel->priv != os->evsel->evlist->selected->priv)
+		return;
+
 	if (!valid_only_metric(unit))
 		return;
 	unit = fixunit(tbuf, os->evsel, unit);
@@ -439,6 +446,12 @@ static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int
 		if (counter->cgrp)
 			os.nfields++;
 	}
+
+	if (!config->no_csv_summary && config->csv_output &&
+	    config->summary && !config->interval) {
+		fprintf(config->output, "%16s%s", "summary", config->csv_sep);
+	}
+
 	if (run == 0 || ena == 0 || counter->counts->scaled == -1) {
 		if (config->metric_only) {
 			pm(config, &os, NULL, "", "", 0);
@@ -526,6 +539,7 @@ static void uniquify_event_name(struct evsel *counter)
 {
 	char *new_name;
 	char *config;
+	int ret = 0;
 
 	if (counter->uniquified_name ||
 	    !counter->pmu_name || !strncmp(counter->name, counter->pmu_name,
@@ -540,8 +554,17 @@ static void uniquify_event_name(struct evsel *counter)
 			counter->name = new_name;
 		}
 	} else {
-		if (asprintf(&new_name,
-			     "%s [%s]", counter->name, counter->pmu_name) > 0) {
+		if (perf_pmu__has_hybrid()) {
+			if (!counter->use_config_name) {
+				ret = asprintf(&new_name, "%s/%s/",
+					       counter->pmu_name, counter->name);
+			}
+		} else {
+			ret = asprintf(&new_name, "%s [%s]",
+				       counter->name, counter->pmu_name);
+		}
+
+		if (ret) {
 			free(counter->name);
 			counter->name = new_name;
 		}
@@ -644,6 +667,9 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
 	if (!collect_data(config, counter, aggr_cb, &ad))
 		return;
 
+	if (perf_pmu__has_hybrid() && ad.ena == 0)
+		return;
+
 	nr = ad.nr;
 	ena = ad.ena;
 	run = ad.run;
@@ -952,8 +978,11 @@ static void print_metric_headers(struct perf_stat_config *config,
 	if (config->csv_output) {
 		if (config->interval)
 			fputs("time,", config->output);
-		fputs(aggr_header_csv[config->aggr_mode], config->output);
+		if (!config->iostat_run)
+			fputs(aggr_header_csv[config->aggr_mode], config->output);
 	}
+	if (config->iostat_run)
+		iostat_print_header_prefix(config);
 
 	/* Print metrics headers only */
 	evlist__for_each_entry(evlist, counter) {
@@ -983,7 +1012,8 @@ static void print_interval(struct perf_stat_config *config,
 	if (config->interval_clear)
 		puts(CONSOLE_CLEAR);
 
-	sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep);
+	if (!config->iostat_run)
+		sprintf(prefix, "%6lu.%09lu%s", (unsigned long) ts->tv_sec, ts->tv_nsec, config->csv_sep);
 
 	if ((num_print_interval == 0 && !config->csv_output) || config->interval_clear) {
 		switch (config->aggr_mode) {
@@ -1019,9 +1049,11 @@ static void print_interval(struct perf_stat_config *config,
 			break;
 		case AGGR_GLOBAL:
 		default:
-			fprintf(output, "#           time");
-			if (!metric_only)
-				fprintf(output, "             counts %*s events\n", unit_width, "unit");
+			if (!config->iostat_run) {
+				fprintf(output, "#           time");
+				if (!metric_only)
+					fprintf(output, "             counts %*s events\n", unit_width, "unit");
+			}
 		case AGGR_UNSET:
 			break;
 		}
@@ -1214,6 +1246,9 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
 	struct evsel *counter;
 	char buf[64], *prefix = NULL;
 
+	if (config->iostat_run)
+		evlist->selected = evlist__first(evlist);
+
 	if (interval)
 		print_interval(config, evlist, prefix = buf, ts);
 	else
@@ -1226,7 +1261,7 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
 			print_metric_headers(config, evlist, prefix, false);
 		if (num_print_iv++ == 25)
 			num_print_iv = 0;
-		if (config->aggr_mode == AGGR_GLOBAL && prefix)
+		if (config->aggr_mode == AGGR_GLOBAL && prefix && !config->iostat_run)
 			fprintf(config->output, "%s", prefix);
 	}
 
@@ -1243,11 +1278,16 @@ void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *conf
 		}
 		break;
 	case AGGR_GLOBAL:
-		evlist__for_each_entry(evlist, counter) {
-			print_counter_aggr(config, counter, prefix);
+		if (config->iostat_run)
+			iostat_print_counters(evlist, config, ts, prefix = buf,
+					      print_counter_aggr);
+		else {
+			evlist__for_each_entry(evlist, counter) {
+				print_counter_aggr(config, counter, prefix);
+			}
+			if (metric_only)
+				fputc('\n', config->output);
 		}
-		if (metric_only)
-			fputc('\n', config->output);
 		break;
 	case AGGR_NONE:
 		if (metric_only)
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 6ccf21a72f06..39967a45f55b 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -9,7 +9,9 @@
 #include "expr.h"
 #include "metricgroup.h"
 #include "cgroup.h"
+#include "units.h"
 #include <linux/zalloc.h>
+#include "iostat.h"
 
 /*
  * AGGR_GLOBAL: Use CPU 0
@@ -961,7 +963,9 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 	struct metric_event *me;
 	int num = 1;
 
-	if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
+	if (config->iostat_run) {
+		iostat_print_metric(config, evsel, out);
+	} else if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
 		total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd);
 
 		if (total) {
@@ -1270,18 +1274,15 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config,
 		generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL,
 				evsel->name, evsel->metric_name, NULL, 1, cpu, out, st);
 	} else if (runtime_stat_n(st, STAT_NSECS, cpu, &rsd) != 0) {
-		char unit = 'M';
-		char unit_buf[10];
+		char unit = ' ';
+		char unit_buf[10] = "/sec";
 
 		total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd);
-
 		if (total)
-			ratio = 1000.0 * avg / total;
-		if (ratio < 0.001) {
-			ratio *= 1000;
-			unit = 'K';
-		}
-		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
+			ratio = convert_unit_double(1000000000.0 * avg / total, &unit);
+
+		if (unit != ' ')
+			snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
 		print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio);
 	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
 		print_smi_cost(config, cpu, out, st, &rsd);
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index c400f8dde017..2db46b9bebd0 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -76,8 +76,7 @@ double rel_stddev_stats(double stddev, double avg)
 	return pct;
 }
 
-bool __perf_evsel_stat__is(struct evsel *evsel,
-			   enum perf_stat_evsel_id id)
+bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id)
 {
 	struct perf_stat_evsel *ps = evsel->stats;
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index d85c292148bb..32c8527de347 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -128,10 +128,12 @@ struct perf_stat_config {
 	bool			 all_user;
 	bool			 percore_show_thread;
 	bool			 summary;
+	bool			 no_csv_summary;
 	bool			 metric_no_group;
 	bool			 metric_no_merge;
 	bool			 stop_read_counter;
 	bool			 quiet;
+	bool			 iostat_run;
 	FILE			*output;
 	unsigned int		 interval;
 	unsigned int		 timeout;
@@ -160,6 +162,7 @@ struct perf_stat_config {
 };
 
 void perf_stat__set_big_num(int set);
+void perf_stat__set_no_csv_summary(int set);
 
 void update_stats(struct stats *stats, u64 val);
 double avg_stats(struct stats *stats);
@@ -187,11 +190,10 @@ struct perf_aggr_thread_value {
 	u64 ena;
 };
 
-bool __perf_evsel_stat__is(struct evsel *evsel,
-			   enum perf_stat_evsel_id id);
+bool __perf_stat_evsel__is(struct evsel *evsel, enum perf_stat_evsel_id id);
 
 #define perf_stat_evsel__is(evsel, id) \
-	__perf_evsel_stat__is(evsel, PERF_STAT_EVSEL_ID__ ## id)
+	__perf_stat_evsel__is(evsel, PERF_STAT_EVSEL_ID__ ## id)
 
 extern struct runtime_stat rt_stat;
 extern struct stats walltime_nsecs_stats;
diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h
index ea94d8628980..be94d7046fa0 100644
--- a/tools/perf/util/strbuf.h
+++ b/tools/perf/util/strbuf.h
@@ -12,7 +12,7 @@
  *    build complex strings/buffers whose final size isn't easily known.
  *
  *    It is NOT legal to copy the ->buf pointer away.
- *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    `strbuf_detach' is the operation that detaches a buffer from its shell
  *    while keeping the shell valid wrt its invariants.
  *
  * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
diff --git a/tools/perf/util/strfilter.h b/tools/perf/util/strfilter.h
index e0c25a40f796..c05aca9ca582 100644
--- a/tools/perf/util/strfilter.h
+++ b/tools/perf/util/strfilter.h
@@ -8,8 +8,8 @@
 
 /* A node of string filter */
 struct strfilter_node {
-	struct strfilter_node *l;	/* Tree left branche (for &,|) */
-	struct strfilter_node *r;	/* Tree right branche (for !,&,|) */
+	struct strfilter_node *l;	/* Tree left branch (for &,|) */
+	struct strfilter_node *r;	/* Tree right branch (for !,&,|) */
 	const char *p;		/* Operator or rule */
 };
 
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 6dff843fd883..4c56aa837434 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1058,7 +1058,7 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
 		curr_dso->symtab_type = dso->symtab_type;
 		maps__insert(kmaps, curr_map);
 		/*
-		 * Add it before we drop the referece to curr_map, i.e. while
+		 * Add it before we drop the reference to curr_map, i.e. while
 		 * we still are sure to have a reference to this DSO via
 		 * *curr_map->dso.
 		 */
diff --git a/tools/perf/util/symbol_fprintf.c b/tools/perf/util/symbol_fprintf.c
index 35c936ce33ef..2664fb65e47a 100644
--- a/tools/perf/util/symbol_fprintf.c
+++ b/tools/perf/util/symbol_fprintf.c
@@ -68,7 +68,7 @@ size_t dso__fprintf_symbols_by_name(struct dso *dso,
 
 	for (nd = rb_first_cached(&dso->symbol_names); nd; nd = rb_next(nd)) {
 		pos = rb_entry(nd, struct symbol_name_rb_node, rb_node);
-		fprintf(fp, "%s\n", pos->sym.name);
+		ret += fprintf(fp, "%s\n", pos->sym.name);
 	}
 
 	return ret;
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index dff178103ce5..35aa0c0f7cd9 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -1211,7 +1211,7 @@ static size_t mask_size(struct perf_cpu_map *map, int *max)
 	*max = 0;
 
 	for (i = 0; i < map->nr; i++) {
-		/* bit possition of the cpu is + 1 */
+		/* bit position of the cpu is + 1 */
 		int bit = map->map[i] + 1;
 
 		if (bit > *max)
@@ -1237,7 +1237,7 @@ void *cpu_map_data__alloc(struct perf_cpu_map *map, size_t *size, u16 *type, int
 	 *   mask  = size of 'struct perf_record_record_cpu_map' +
 	 *           maximum cpu bit converted to size of longs
 	 *
-	 * and finaly + the size of 'struct perf_record_cpu_map_data'.
+	 * and finally + the size of 'struct perf_record_cpu_map_data'.
 	 */
 	size_cpus = cpus_size(map);
 	size_mask = mask_size(map, max);
diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c
index 03bd99d3be16..a2e906858891 100644
--- a/tools/perf/util/syscalltbl.c
+++ b/tools/perf/util/syscalltbl.c
@@ -34,6 +34,10 @@ static const char **syscalltbl_native = syscalltbl_powerpc_32;
 #include <asm/syscalls.c>
 const int syscalltbl_native_max_id = SYSCALLTBL_ARM64_MAX_ID;
 static const char **syscalltbl_native = syscalltbl_arm64;
+#elif defined(__mips__)
+#include <asm/syscalls_n64.c>
+const int syscalltbl_native_max_id = SYSCALLTBL_MIPS_N64_MAX_ID;
+static const char **syscalltbl_native = syscalltbl_mips_n64;
 #endif
 
 struct syscall {
diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index f132c6c2eef8..4ff56217f2a6 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -16,6 +16,8 @@ struct target {
 	bool	     uses_mmap;
 	bool	     default_per_cpu;
 	bool	     per_thread;
+	bool	     use_bpf;
+	const char   *attr_map;
 };
 
 enum target_errno {
@@ -64,11 +66,6 @@ static inline bool target__has_cpu(struct target *target)
 	return target->system_wide || target->cpu_list;
 }
 
-static inline bool target__has_bpf(struct target *target)
-{
-	return target->bpf_str;
-}
-
 static inline bool target__none(struct target *target)
 {
 	return !target__has_task(target) && !target__has_cpu(target);
diff --git a/tools/perf/util/thread-stack.h b/tools/perf/util/thread-stack.h
index 3bc47a42af8e..b3cd09beb62f 100644
--- a/tools/perf/util/thread-stack.h
+++ b/tools/perf/util/thread-stack.h
@@ -16,7 +16,6 @@ struct comm;
 struct ip_callchain;
 struct symbol;
 struct dso;
-struct comm;
 struct perf_sample;
 struct addr_location;
 struct call_path;
diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c
index 62b4c75c966c..f19791d46e99 100644
--- a/tools/perf/util/tsc.c
+++ b/tools/perf/util/tsc.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <errno.h>
+#include <inttypes.h>
+#include <string.h>
 
 #include <linux/compiler.h>
 #include <linux/perf_event.h>
@@ -110,3 +112,31 @@ u64 __weak rdtsc(void)
 {
 	return 0;
 }
+
+size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp)
+{
+	struct perf_record_time_conv *tc = (struct perf_record_time_conv *)event;
+	size_t ret;
+
+	ret  = fprintf(fp, "\n... Time Shift      %" PRI_lu64 "\n", tc->time_shift);
+	ret += fprintf(fp, "... Time Muliplier  %" PRI_lu64 "\n", tc->time_mult);
+	ret += fprintf(fp, "... Time Zero       %" PRI_lu64 "\n", tc->time_zero);
+
+	/*
+	 * The event TIME_CONV was extended for the fields from "time_cycles"
+	 * when supported cap_user_time_short, for backward compatibility,
+	 * prints the extended fields only if they are contained in the event.
+	 */
+	if (event_contains(*tc, time_cycles)) {
+		ret += fprintf(fp, "... Time Cycles     %" PRI_lu64 "\n",
+			       tc->time_cycles);
+		ret += fprintf(fp, "... Time Mask       %#" PRI_lx64 "\n",
+			       tc->time_mask);
+		ret += fprintf(fp, "... Cap Time Zero   %" PRId32 "\n",
+			       tc->cap_user_time_zero);
+		ret += fprintf(fp, "... Cap Time Short  %" PRId32 "\n",
+			       tc->cap_user_time_short);
+	}
+
+	return ret;
+}
diff --git a/tools/perf/util/tsc.h b/tools/perf/util/tsc.h
index 72a15419f3b3..7d83a31732a7 100644
--- a/tools/perf/util/tsc.h
+++ b/tools/perf/util/tsc.h
@@ -4,6 +4,8 @@
 
 #include <linux/types.h>
 
+#include "event.h"
+
 struct perf_tsc_conversion {
 	u16 time_shift;
 	u32 time_mult;
@@ -24,4 +26,6 @@ u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc);
 u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc);
 u64 rdtsc(void);
 
+size_t perf_event__fprintf_time_conv(union perf_event *event, FILE *fp);
+
 #endif // __PERF_TSC_H
diff --git a/tools/perf/util/units.c b/tools/perf/util/units.c
index a46762aec4c9..32c39cfe209b 100644
--- a/tools/perf/util/units.c
+++ b/tools/perf/util/units.c
@@ -33,28 +33,35 @@ unsigned long parse_tag_value(const char *str, struct parse_tag *tags)
 	return (unsigned long) -1;
 }
 
-unsigned long convert_unit(unsigned long value, char *unit)
+double convert_unit_double(double value, char *unit)
 {
 	*unit = ' ';
 
-	if (value > 1000) {
-		value /= 1000;
+	if (value > 1000.0) {
+		value /= 1000.0;
 		*unit = 'K';
 	}
 
-	if (value > 1000) {
-		value /= 1000;
+	if (value > 1000.0) {
+		value /= 1000.0;
 		*unit = 'M';
 	}
 
-	if (value > 1000) {
-		value /= 1000;
+	if (value > 1000.0) {
+		value /= 1000.0;
 		*unit = 'G';
 	}
 
 	return value;
 }
 
+unsigned long convert_unit(unsigned long value, char *unit)
+{
+	double v = convert_unit_double((double)value, unit);
+
+	return (unsigned long)v;
+}
+
 int unit_number__scnprintf(char *buf, size_t size, u64 n)
 {
 	char unit[4] = "BKMG";
diff --git a/tools/perf/util/units.h b/tools/perf/util/units.h
index 99263b6a23f7..ea43e74e3240 100644
--- a/tools/perf/util/units.h
+++ b/tools/perf/util/units.h
@@ -12,6 +12,7 @@ struct parse_tag {
 
 unsigned long parse_tag_value(const char *str, struct parse_tag *tags);
 
+double convert_unit_double(double value, char *unit);
 unsigned long convert_unit(unsigned long value, char *unit);
 int unit_number__scnprintf(char *buf, size_t size, u64 n);
 
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index 9aededc0bc06..71a353349181 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -82,7 +82,7 @@ UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
 #define DW_EH_PE_funcrel	0x40	/* start-of-procedure-relative */
 #define DW_EH_PE_aligned	0x50	/* aligned pointer */
 
-/* Flags intentionaly not handled, since they're not needed:
+/* Flags intentionally not handled, since they're not needed:
  * #define DW_EH_PE_indirect      0x80
  * #define DW_EH_PE_uleb128       0x01
  * #define DW_EH_PE_udata2        0x02