summaryrefslogtreecommitdiffstats
path: root/src/plugins/perfmon/arm/events.h
diff options
context:
space:
mode:
authorZachary Leaf <zachary.leaf@arm.com>2022-05-12 02:26:00 -0500
committerDamjan Marion <dmarion@0xa5.net>2022-07-12 15:29:23 +0000
commit268d7be66b8b48a230e06de645e3a8b7de29d93c (patch)
tree098d15e0c3f6fa7864c3f581367d8d5bb80495fa /src/plugins/perfmon/arm/events.h
parentc7d43a5eb19f2acab900274432cfd0e136d6cb44 (diff)
perfmon: enable perfmon plugin for Arm
This patch enables statistics from the Arm PMUv3 through the perfmon plugin. In comparison to using the Linux "perf" tool, it allows obtaining direct, per node level statistics (rather than per thread). By accessing the PMU counter registers directly from userspace, we can avoid the overhead of using a read() system call and get more accurate and fine grained statistics about the running of individual nodes. A demo of perfmon on Arm can be found at: https://asciinema.org/a/egVNN1OF7JEKHYmfl5bpDYxfF *Important Note* Perfmon on Arm is dependent on and works only on Linux kernel versions of v5.17+ as this is when userspace access to Arm perf counters was included. On most Arm systems, a maximum of 7 PMU events can be configured at once - (6x PMU events + 1x CPU_CYCLE counter). If some perf counters are in use elsewhere by other applications, and there are insufficient counters remaining to open the bundle, the perf_event_open call will fail (provided the events are grouped with the group_fd param, which perfmon currently utilises). See arm/events.h for a list of PMUv3 events available, although it is implementation defined whether most events are implemented or not. Only a small set of 7 events is required to be implemented in Armv8.0, with some additional events required in later versions. As such, depending on the implementation, some statistics may not be available. See Arm Architecture Reference Manual for Armv8-A, D7.10.2 "The PMU event number space and common events" for more information. arm/events.c:arm_init() gets information from the sysfs about what events are implemented on a particular CPU at runtime. Arm's implementation of the perfmon source callback .bundle_support uses this information to disable unsupported events in a bundle, or in the case no events are supported, disable the entire bundle. Where a particular event in a bundle is not implemented, the statistic for that event is shown as '-' in the 'show perfmon statistics' cli output, by disabling the column. There is additional code in perfmon.c to only open events which are marked as implemented. Since we're only opening and reading events that are implemented, some extra logic is required in cli.c to re-align either perfmon_node_stats_t or perfmon_reading_t with the column headings configured in each bundle, taking into account disabled columns. Userspace access to perf counters is disabled by default, and needs to be enabled with 'sudo sysctl kernel/perf_user_access=1'. There is a check built into the Arm event source init function (arm/events.c:arm_init) to check that userspace reading of perf counters is enabled in the /proc/sys/kernel/perf_user_access file. If the above file does not exist, it means the kernel version is unsupported. Users without a supported kernel will see a warning message, and no Arm bundles will be registered to use in perfmon. Enabling/using plugin: - include the following in startup.conf: - plugins { plugin perfmon_plugin.so { enable } - 'show perfmon bundle [verbose]' - show available statistics bundles - 'perfmon start bundle <bundle-name>' - enable and start logging - 'perfmon stop' - stop logging - 'show perfmon statistics' - show output For a general guide on using and understanding Arm PMUv3 events, see https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/arm-neoverse-n1-performance-analysis-methodology Type: feature Signed-off-by: Zachary Leaf <zachary.leaf@arm.com> Tested-by: Jieqiang Wang <jieqiang.wang@arm.com> Change-Id: I0620fe5b1bbe78842dfb1d0b6a060bb99e777651
Diffstat (limited to 'src/plugins/perfmon/arm/events.h')
-rw-r--r--src/plugins/perfmon/arm/events.h130
1 files changed, 130 insertions, 0 deletions
diff --git a/src/plugins/perfmon/arm/events.h b/src/plugins/perfmon/arm/events.h
new file mode 100644
index 00000000000..5b7c49801d0
--- /dev/null
+++ b/src/plugins/perfmon/arm/events.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2022 Arm and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __perfmon_arm_h
+#define __perfmon_arm_h
+
+/*
+ * Events from the Armv8 PMUv3 - See "Arm Architecture Reference Manual Armv8,
+ * for Armv8-A architecture profile" D7.10 PMU events and event numbers:
+ * https://developer.arm.com/documentation/ddi0487/latest/
+ * EventCode, name, description
+ */
+#define foreach_perf_arm_event \
+ _ (0x0D, BR_IMMED_RETIRED, "Immediate branch architecturally executed") \
+ _ (0x10, BR_MIS_PRED, \
+ "Mispredicted or not predicted branch Speculatively executed") \
+ _ (0x22, BR_MIS_PRED_RETIRED, \
+ "Instruction architecturally executed, mispredicted branch") \
+ _ (0x12, BR_PRED, "Predictable branch Speculatively executed") \
+ _ (0x21, BR_RETIRED, "Branch instruction architecturally executed") \
+ _ (0x0E, BR_RETURN_RETIRED, \
+ "Function return instruction architecturally executed and the " \
+ "condition code check pass") \
+ _ (0x19, BUS_ACCESS, "Attributable Bus access") \
+ _ (0x1D, BUS_CYCLES, "Bus cycle") \
+ _ (0x1E, CHAIN, \
+ "For an odd numbered counter, increment when an overflow occurs on" \
+ "the preceding even-numbered counter on the same PE") \
+ _ (0x0B, CID_WRITE_RETIRED, \
+ "Instruction architecturally executed, Condition code check pass, " \
+ "write to CONTEXTIDR") \
+ _ (0x11, CPU_CYCLES, "Cycle counter") \
+ _ (0x34, DTLB_WALK, \
+ "Access to data or unified TLB causes a translation table walk") \
+ _ (0x0A, EXC_RETURN, \
+ "Exception return instruction architecturally executed and the " \
+ "condition code check pass") \
+ _ (0x09, EXC_TAKEN, "Exception entry") \
+ _ (0x08, INST_RETIRED, "Instruction architecturally executed") \
+ _ (0x1B, INST_SPEC, "Operation Speculatively executed") \
+ _ (0x35, ITLB_WALK, \
+ "Access to instruction TLB that causes a translation table walk") \
+ _ (0x04, L1D_CACHE, "Level 1 data cache access") \
+ _ (0x1F, L1D_CACHE_ALLOCATE, \
+ "Level 1 data cache allocation without refill") \
+ _ (0x39, L1D_CACHE_LMISS_RD, "Level 1 data cache long-latency read miss") \
+ _ (0x03, L1D_CACHE_REFILL, "Level 1 data cache refill") \
+ _ (0x15, L1D_CACHE_WB, "Attributable Level 1 data cache write-back") \
+ _ (0x25, L1D_TLB, "Level 1 data or unified TLB access") \
+ _ (0x05, L1D_TLB_REFILL, "Level 1 data or unified TLB refill") \
+ _ (0x14, L1I_CACHE, "Level 1 instruction cache access") \
+ _ (0x01, L1I_CACHE_REFILL, "Level 1 instruction cache refill") \
+ _ (0x26, L1I_TLB, "Level 1 instruction TLB access") \
+ _ (0x02, L1I_TLB_REFILL, "Level 1 instruction TLB refill") \
+ _ (0x16, L2D_CACHE, "Level 2 data cache access") \
+ _ (0x20, L2D_CACHE_ALLOCATE, \
+ "Level 2 data cache allocation without refill") \
+ _ (0x17, L2D_CACHE_REFILL, "Level 2 data cache refill") \
+ _ (0x18, L2D_CACHE_WB, "Attributable Level 2 data cache write-back") \
+ _ (0x2F, L2D_TLB, "Level 2 data or unified TLB access") \
+ _ (0x2D, L2D_TLB_REFILL, "Level 2 data or unified TLB refill") \
+ _ (0x27, L2I_CACHE, "Level 2 instruction cache access") \
+ _ (0x28, L2I_CACHE_REFILL, "Attributable Level 2 instruction cache refill") \
+ _ (0x30, L2I_TLB, "Level 2 instruction TLB access") \
+ _ (0x2E, L2I_TLB_REFILL, "Level 2 instruction TLB refill") \
+ _ (0x2B, L3D_CACHE, "Level 3 data cache access") \
+ _ (0x29, L3D_CACHE_ALLOCATE, \
+ "Level 3 data cache allocation without refill") \
+ _ (0x2A, L3D_CACHE_REFILL, "Attributable Level 3 data cache refill") \
+ _ (0x2C, L3D_CACHE_WB, "Attributable Level 3 data cache write-back") \
+ _ (0x06, LD_RETIRED, \
+ "Memory-reading instruction architecturally executed and condition" \
+ " code check pass") \
+ _ (0x32, LL_CACHE, "Last Level cache access") \
+ _ (0x33, LL_CACHE_MISS, "Last Level cache miss") \
+ _ (0x37, LL_CACHE_MISS_RD, "Last level cache miss, read") \
+ _ (0x36, LL_CACHE_RD, "Last level data cache access, read") \
+ _ (0x1A, MEMORY_ERROR, "Local memory error") \
+ _ (0x13, MEM_ACCESS, "Data memory access") \
+ _ (0x3A, OP_RETIRED, "Micro-operation architecturally executed") \
+ _ (0x3B, OP_SPEC, "Micro-operation Speculatively executed") \
+ _ (0x0C, PC_WRITE_RETIRED, \
+ "Software change to the Program Counter (PC). Instruction is " \
+ "architecturally executed and condition code check pass") \
+ _ (0x31, REMOTE_ACCESS, \
+ "Access to another socket in a multi-socket system") \
+ _ (0x38, REMOTE_ACCESS_RD, \
+ "Access to another socket in a multi-socket system, read") \
+ _ (0x3C, STALL, "No operation sent for execution") \
+ _ (0x24, STALL_BACKEND, "No operation issued due to the backend") \
+ _ (0x23, STALL_FRONTEND, "No operation issued due to the frontend") \
+ _ (0x3F, STALL_SLOT, "No operation sent for execution on a Slot") \
+ _ (0x3D, STALL_SLOT_BACKEND, \
+ "No operation sent for execution on a Slot due to the backend") \
+ _ (0x3E, STALL_SLOT_FRONTEND, \
+ "No operation sent for execution on a Slot due to the frontend") \
+ _ (0x07, ST_RETIRED, \
+ "Memory-writing instruction architecturally executed and condition" \
+ " code check pass") \
+ _ (0x00, SW_INCR, \
+ "Instruction architecturally executed, Condition code check pass, " \
+ "software increment") \
+ _ (0x1C, TTBR_WRITE_RETIRED, \
+ "Instruction architecturally executed, Condition code check pass, " \
+ "write to TTBR") \
+ _ (0x0F, UNALIGNED_LDST_RETIRED, \
+ "Unaligned memory memory-reading or memory-writing instruction " \
+ "architecturally executed and condition code check pass")
+
+typedef enum
+{
+#define _(event, n, desc) ARMV8_PMUV3_##n,
+ foreach_perf_arm_event
+#undef _
+ ARM_N_EVENTS,
+} perf_arm_event_t;
+
+#endif