From 4a6306aa6920120b99ade708e7346953424de456 Mon Sep 17 00:00:00 2001 From: Ray Kinsella Date: Wed, 12 Jan 2022 04:47:27 +0000 Subject: perfmon: frontend and backend boundness bundles Renamed memory stalls to topdown backend-bound-mem, added topdown frontend-bound-latency and frontend-bound-bandwidth. Type: improvement Signed-off-by: Ray Kinsella Change-Id: I70f42b6b63fe2502635cad4aed4271e2bbdda5f1 --- src/plugins/perfmon/CMakeLists.txt | 4 +- .../perfmon/intel/bundle/backend_bound_mem.c | 102 +++++++++++++++++++++ .../perfmon/intel/bundle/frontend_bound_bw.c | 90 ++++++++++++++++++ .../perfmon/intel/bundle/frontend_bound_lat.c | 99 ++++++++++++++++++++ src/plugins/perfmon/intel/bundle/memory_stalls.c | 59 ------------ src/plugins/perfmon/intel/core.h | 48 ++++++++-- 6 files changed, 334 insertions(+), 68 deletions(-) create mode 100644 src/plugins/perfmon/intel/bundle/backend_bound_mem.c create mode 100644 src/plugins/perfmon/intel/bundle/frontend_bound_bw.c create mode 100644 src/plugins/perfmon/intel/bundle/frontend_bound_lat.c delete mode 100644 src/plugins/perfmon/intel/bundle/memory_stalls.c (limited to 'src/plugins/perfmon') diff --git a/src/plugins/perfmon/CMakeLists.txt b/src/plugins/perfmon/CMakeLists.txt index 44be59ef539..e262984b610 100644 --- a/src/plugins/perfmon/CMakeLists.txt +++ b/src/plugins/perfmon/CMakeLists.txt @@ -23,7 +23,7 @@ add_vpp_plugin(perfmon perfmon.c intel/core.c intel/uncore.c - intel/bundle/memory_stalls.c + intel/bundle/backend_bound_mem.c intel/bundle/inst_and_clock.c intel/bundle/load_blocks.c intel/bundle/mem_bw.c @@ -32,6 +32,8 @@ add_vpp_plugin(perfmon intel/bundle/power_license.c intel/bundle/topdown_metrics.c intel/bundle/topdown_tremont.c + intel/bundle/frontend_bound_bw.c + intel/bundle/frontend_bound_lat.c intel/bundle/iio_bw.c COMPONENT diff --git a/src/plugins/perfmon/intel/bundle/backend_bound_mem.c b/src/plugins/perfmon/intel/bundle/backend_bound_mem.c new file mode 100644 index 00000000000..ccf1ed12153 --- 
/dev/null
+++ b/src/plugins/perfmon/intel/bundle/backend_bound_mem.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <perfmon/perfmon.h>
+#include <perfmon/intel/core.h>
+
+/*
+ * Indices into perfmon_node_stats_t value[].  Each one equals the position
+ * of the matching .events[] entry in the bundle registered below.
+ */
+enum
+{
+  STALLS_L1D_MISS = 0,
+  STALLS_L2_MISS = 1,
+  STALLS_L3_MISS = 2,
+  STALLS_MEM_ANY = 3,
+  STALLS_TOTAL = 4, /* collected, but not shown in any column */
+  BOUND_ON_STORES = 5,
+  FB_FULL = 6,
+  THREAD = 7,
+};
+
+/*
+ * Format one column of the td-backend-mem bundle.
+ *
+ * Variadic arguments, in order:
+ *   perfmon_node_stats_t *  counters sampled for one node
+ *   int                     zero-based column index (column_headers order)
+ *
+ * Column 0 is unhalted clocks per packet.  Columns 1-6 are stall cycles as
+ * a percentage of unhalted clocks, clamped at 0.  Nothing is appended when
+ * no packets or no clocks were recorded.
+ *
+ * Returns the (possibly extended) vector s.
+ */
+static u8 *
+format_intel_backend_bound_mem (u8 *s, va_list *args)
+{
+  perfmon_node_stats_t *ss = va_arg (*args, perfmon_node_stats_t *);
+  int row = va_arg (*args, int);
+  f64 cycles;
+  f64 sv = 0;
+
+  /* Both values are used as divisors below. */
+  if (!ss->n_packets || !ss->value[THREAD])
+    return s;
+
+  cycles = ss->value[THREAD];
+
+  if (0 == row)
+    {
+      /* Floating-point division, as the td-frontend-bw bundle does. */
+      sv = cycles / ss->n_packets;
+
+      s = format (s, "%.0f", sv);
+      return s;
+    }
+
+  /*
+   * Differences are taken after converting each operand to f64.  The
+   * counters look like unsigned integers (TODO confirm against
+   * perfmon_node_stats_t); a raw difference would then wrap around instead
+   * of going negative, and the clamp below could never take effect.
+   */
+  switch (row)
+    {
+    case 1:
+      sv = ss->value[BOUND_ON_STORES];
+      break;
+    case 2:
+      sv = (f64) ss->value[STALLS_MEM_ANY] - (f64) ss->value[STALLS_L1D_MISS];
+      break;
+    case 3:
+      sv = ss->value[FB_FULL];
+      break;
+    case 4:
+      sv = (f64) ss->value[STALLS_L1D_MISS] - (f64) ss->value[STALLS_L2_MISS];
+      break;
+    case 5:
+      sv = (f64) ss->value[STALLS_L2_MISS] - (f64) ss->value[STALLS_L3_MISS];
+      break;
+    case 6:
+      sv = ss->value[STALLS_L3_MISS];
+      break;
+    default:
+      /* Unknown column: sv stays 0. */
+      break;
+    }
+
+  sv = clib_max ((sv / cycles) * 100, 0);
+
+  s = format (s, "%04.1f", sv);
+
+  return s;
+}
+
+static perfmon_cpu_supports_t backend_bound_mem_cpu_supports[] = {
+  { clib_cpu_supports_avx512_bitalg, PERFMON_BUNDLE_TYPE_NODE },
+};
+
+PERFMON_REGISTER_BUNDLE (intel_core_backend_bound_mem) = {
+  .name = "td-backend-mem",
+  .description = "Topdown BackEnd-bound Memory - % cycles not retiring "
+                 "instructions due to memory stalls",
+  .source = "intel-core",
+  .events[0] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_L1D_MISS, /* 0x0F */
+  .events[1] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_L2_MISS,  /* 0x0F */
+  .events[2] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_L3_MISS,  /* 0x0F */
+  .events[3] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_MEM_ANY,  /* 0xFF */
+  .events[4] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_TOTAL,    /* 0xFF */
+  .events[5] = INTEL_CORE_E_EXE_ACTIVITY_BOUND_ON_STORES,   /* 0xFF */
+  .events[6] = INTEL_CORE_E_L1D_PEND_MISS_FB_FULL,          /* 0x0F */
+  .events[7] = INTEL_CORE_E_CPU_CLK_UNHALTED_THREAD_P,      /* 0xFF */
+  .n_events = 8,
+  .format_fn = format_intel_backend_bound_mem,
+  .cpu_supports = backend_bound_mem_cpu_supports,
+  .n_cpu_supports = ARRAY_LEN (backend_bound_mem_cpu_supports),
+  .column_headers = PERFMON_STRINGS ("Clocks/Packet", "%Store Bound",
+                                     "%L1 Bound", "%FB Full", "%L2 Bound",
+                                     "%L3 Bound", "%DRAM Bound"),
+};
diff --git a/src/plugins/perfmon/intel/bundle/frontend_bound_bw.c b/src/plugins/perfmon/intel/bundle/frontend_bound_bw.c
new file mode 100644
index 00000000000..5e5835a7868
--- /dev/null
+++ b/src/plugins/perfmon/intel/bundle/frontend_bound_bw.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <perfmon/perfmon.h>
+#include <perfmon/intel/core.h>
+
+/* Indices into perfmon_node_stats_t value[], in .events[] order. */
+enum
+{
+  DSB_UOPS,
+  MS_UOPS,
+  MITE_UOPS,
+  LSD_UOPS,
+};
+
+/*
+ * Format one column of the td-frontend-bw bundle.
+ *
+ * Variadic arguments, in order:
+ *   perfmon_node_stats_t *  counters sampled for one node
+ *   int                     zero-based column index (column_headers order)
+ *
+ * Column 0 is the sum of the four uop counters per packet.  Columns 1-4
+ * are each counter's share of that sum, in percent; they read 0 when no
+ * uops were counted.  Nothing is appended when no packets were recorded.
+ *
+ * Returns the (possibly extended) vector s.
+ */
+static u8 *
+format_intel_frontend_bound_bw (u8 *s, va_list *args)
+{
+  perfmon_node_stats_t *ss = va_arg (*args, perfmon_node_stats_t *);
+  int row = va_arg (*args, int);
+  f64 sv = 0;
+  f64 part = 0;
+  f64 uops = ss->value[DSB_UOPS] + ss->value[MS_UOPS] + ss->value[MITE_UOPS] +
+             ss->value[LSD_UOPS];
+
+  if (!ss->n_packets)
+    return s;
+
+  if (row == 0)
+    {
+      sv = uops / ss->n_packets;
+      s = format (s, "%.0f", sv);
+
+      return s;
+    }
+
+  switch (row)
+    {
+    case 1:
+      part = ss->value[DSB_UOPS];
+      break;
+    case 2:
+      part = ss->value[MS_UOPS];
+      break;
+    case 3:
+      part = ss->value[MITE_UOPS];
+      break;
+    case 4:
+      part = ss->value[LSD_UOPS];
+      break;
+    default:
+      /* Unknown column: part stays 0. */
+      break;
+    }
+
+  /* With no uops counted the ratio would be 0/0; report 0% instead. */
+  if (uops > 0)
+    sv = (part / uops) * 100;
+
+  s = format (s, "%04.1f", sv);
+
+  return s;
+}
+
+static perfmon_cpu_supports_t frontend_bound_bw_cpu_supports[] = {
+  { clib_cpu_supports_avx512_bitalg, PERFMON_BUNDLE_TYPE_NODE },
+};
+
+PERFMON_REGISTER_BUNDLE (intel_core_frontend_bound_bw) = {
+  .name = "td-frontend-bw",
+  .description =
+    "Topdown FrontEnd-bound BandWidth - % uops from each uop fetch source",
+  .source = "intel-core",
+  .events[0] = INTEL_CORE_E_IDQ_DSB_UOPS,  /* 0x0F */
+  .events[1] = INTEL_CORE_E_IDQ_MS_UOPS,   /* 0x0F */
+  .events[2] = INTEL_CORE_E_IDQ_MITE_UOPS, /* 0x0F */
+  .events[3] = INTEL_CORE_E_LSD_UOPS,      /* 0x0F */
+  .n_events = 4,
+  .format_fn = format_intel_frontend_bound_bw,
+  .cpu_supports = frontend_bound_bw_cpu_supports,
+  .n_cpu_supports = ARRAY_LEN (frontend_bound_bw_cpu_supports),
+  .column_headers = PERFMON_STRINGS ("UOPs/PKT", "% DSB UOPS", "% MS UOPS",
+                                     "% MITE UOPS", "% LSD UOPS"),
+  .footer =
+    "For more information, see the Intel(R) 64 and IA-32 Architectures\n"
+    "Optimization Reference Manual section on the Front End.",
+};
diff --git a/src/plugins/perfmon/intel/bundle/frontend_bound_lat.c b/src/plugins/perfmon/intel/bundle/frontend_bound_lat.c
new file mode 100644
index
00000000000..aea2149663f
--- /dev/null
+++ b/src/plugins/perfmon/intel/bundle/frontend_bound_lat.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2022 Intel and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <perfmon/perfmon.h>
+#include <perfmon/intel/core.h>
+
+/*
+ * Multipliers applied to the MS_SWITCHES and BACLEARS event counts before
+ * they are divided by cycles.  Presumably estimated cycle penalties per
+ * event - TODO confirm against Intel's top-down metric definitions.
+ */
+static const int MS_Switches_Cost = 3;
+static const int BA_Clear_Cost = 10;
+
+/* Indices into perfmon_node_stats_t value[], in .events[] order. */
+enum
+{
+  ICACHE_MISS,
+  DSB_SWITCHES,
+  RESTEER,
+  MS_SWITCHES,
+  BACLEARS,
+  THREAD,
+};
+
+/*
+ * Format one column of the td-frontend-lat bundle.
+ *
+ * Variadic arguments, in order:
+ *   perfmon_node_stats_t *  counters sampled for one node
+ *   int                     zero-based column index (column_headers order)
+ *
+ * Column 0 is unhalted clocks per packet.  Columns 1-4 are front-end
+ * latency sources as a percentage of unhalted clocks.  Nothing is appended
+ * when no packets or no clocks were recorded.
+ *
+ * Returns the (possibly extended) vector s.
+ */
+static u8 *
+format_intel_frontend_bound_lat (u8 *s, va_list *args)
+{
+  perfmon_node_stats_t *ss = va_arg (*args, perfmon_node_stats_t *);
+  int row = va_arg (*args, int);
+  f64 sv = 0;
+  f64 cycles = ss->value[THREAD];
+
+  /* Both values are used as divisors below. */
+  if (!ss->n_packets || !ss->value[THREAD])
+    return s;
+
+  if (!row)
+    {
+      /* Floating-point division, as the td-frontend-bw bundle does. */
+      sv = cycles / ss->n_packets;
+
+      s = format (s, "%.0f", sv);
+
+      return s;
+    }
+
+  switch (row)
+    {
+    case 1:
+      sv = ss->value[ICACHE_MISS] / cycles;
+      break;
+    case 2:
+      sv = ss->value[DSB_SWITCHES] / cycles;
+      break;
+    case 3:
+      sv =
+        (ss->value[RESTEER] + (ss->value[BACLEARS] * BA_Clear_Cost)) / cycles;
+      break;
+    case 4:
+      sv = (ss->value[MS_SWITCHES] * MS_Switches_Cost) / cycles;
+      break;
+    default:
+      /* Unknown column: sv stays 0. */
+      break;
+    }
+
+  s = format (s, "%04.1f", sv * 100);
+
+  return s;
+}
+
+static perfmon_cpu_supports_t frontend_bound_lat_cpu_supports[] = {
+  { clib_cpu_supports_avx512_bitalg, PERFMON_BUNDLE_TYPE_NODE },
+};
+
+PERFMON_REGISTER_BUNDLE (intel_core_frontend_bound_lat) = {
+  .name = "td-frontend-lat",
+  .description = "Topdown FrontEnd-bound Latency - % cycles not retiring uops "
+                 "due to frontend latency",
+  .source = "intel-core",
+  .events[0] = INTEL_CORE_E_ICACHE_16B_IFDATA_STALL,          /* 0x0F */
+  .events[1] = INTEL_CORE_E_DSB2MITE_SWITCHES_PENALTY_CYCLES, /* 0x0F */
+  .events[2] = INTEL_CORE_E_INT_MISC_CLEAR_RESTEER_CYCLES,    /* 0xFF */
+  .events[3] = INTEL_CORE_E_IDQ_MS_SWITCHES,                  /* 0x0F */
+  .events[4] = INTEL_CORE_E_BACLEARS_ANY,                     /* 0x0F */
+  .events[5] = INTEL_CORE_E_CPU_CLK_UNHALTED_THREAD_P,        /* FIXED */
+  .n_events = 6,
+  .format_fn = format_intel_frontend_bound_lat,
+  .cpu_supports = frontend_bound_lat_cpu_supports,
+  .n_cpu_supports = ARRAY_LEN (frontend_bound_lat_cpu_supports),
+  .column_headers = PERFMON_STRINGS ("Clocks/Packet", "% iCache Miss",
+                                     "% DSB Switch", "% Branch Resteer",
+                                     "% MS Switch"),
+  .footer =
+    "For more information, see the Intel(R) 64 and IA-32 Architectures\n"
+    "Optimization Reference Manual section on the Front End.",
+};
diff --git a/src/plugins/perfmon/intel/bundle/memory_stalls.c b/src/plugins/perfmon/intel/bundle/memory_stalls.c
deleted file mode 100644
index 3de3a615732..00000000000
--- a/src/plugins/perfmon/intel/bundle/memory_stalls.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2021 Intel and/or its affiliates.
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at:
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <perfmon/perfmon.h>
-#include <perfmon/intel/core.h>
-
-static u8 *
-format_intel_memory_stalls (u8 *s, va_list *args)
-{
-  perfmon_node_stats_t *ss = va_arg (*args, perfmon_node_stats_t *);
-  int row = va_arg (*args, int);
-  f64 sv = 0;
-
-  if (!ss->n_packets)
-    return s;
-
-  sv = ss->value[row] / ss->n_packets;
-
-  s = format (s, "%5.0f", sv);
-
-  return s;
-}
-
-static perfmon_cpu_supports_t memory_stalls_cpu_supports[] = {
-  { clib_cpu_supports_avx512_bitalg, PERFMON_BUNDLE_TYPE_NODE },
-};
-
-PERFMON_REGISTER_BUNDLE (intel_core_memory_stalls) = {
-  .name = "memory-stalls",
-  .description = "cycles not retiring instructions due to memory stalls",
-  .source = "intel-core",
-  .events[0] = INTEL_CORE_E_CPU_CLK_UNHALTED_THREAD_P, /* FIXED */
-  .events[1] = INTEL_CORE_E_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE, /*CMask: 0xFF*/
-  .events[2] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_MEM_ANY, /*CMask: 0xFF*/
-  .events[3] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_L1D_MISS, /*CMask: 0xF*/
-  .events[4] = INTEL_CORE_E_L1D_PEND_MISS_FB_FULL, /*CMask: 0xF*/
-  .events[5] = INTEL_CORE_E_CYCLE_ACTIVITY_STALLS_L3_MISS, /*CMask: 0xF*/
-  .events[6] = INTEL_CORE_E_SQ_MISC_SQ_FULL, /*CMask: 0xF*/
-  .n_events = 7,
-  .format_fn = format_intel_memory_stalls,
-  .cpu_supports = memory_stalls_cpu_supports,
-  .n_cpu_supports = ARRAY_LEN (memory_stalls_cpu_supports),
-  .column_headers = PERFMON_STRINGS ("Cycles/Packet", "Cycles Stall/Packet",
-                                     "Mem Stall/Packet",
-                                     "L1D Miss Stall/Packet", "FB Full/Packet",
-                                     "L3 Miss Stall/Packet", "SQ Full/Packet"),
-};
diff --git a/src/plugins/perfmon/intel/core.h b/src/plugins/perfmon/intel/core.h
index 320d09fd7c5..31daf273517 100644
--- a/src/plugins/perfmon/intel/core.h
+++ b/src/plugins/perfmon/intel/core.h
@@ -89,6 +89,12 @@
   _ (0x0D, 0x01, 0, 0, 0, 0x00, INT_MISC, RECOVERY_CYCLES, \
      "Core cycles the allocator was stalled due to recovery from earlier " \
      "clear event for this thread (e.g. misprediction or memory nuke)") \
+  _ (0x0D, 0x10, 0, 0, 0, 0x00, INT_MISC, UOP_DROPPING, \
+     "Estimated number of Top-down Microarchitecture Analysis slots that got " \
+     "dropped due to non front-end reasons") \
+  _ (0x0D, 0x80, 0, 0, 0, 0x00, INT_MISC, CLEAR_RESTEER_CYCLES, \
+     "Counts cycles after recovery from a branch misprediction or machine " \
+     "clear till the first uop is issued from the resteered path.") \
   _ (0x0E, 0x01, 0, 0, 0, 0x00, UOPS_ISSUED, ANY, \
      "Uops that Resource Allocation Table (RAT) issues to Reservation " \
      "Station (RS)") \
@@ -123,9 +129,23 @@
   _ (0x51, 0x01, 0, 0, 0, 0x00, L1D, REPLACEMENT, \
      "L1D data line replacements") \
   _ (0x51, 0x04, 0, 0, 0, 0x00, L1D, M_EVICT, "L1D data line evictions") \
-  _ (0x83, 0x02, 0, 0, 0, 0x00, ICACHE_64B, IFTAG_MISS, \
-     "Instruction fetch tag lookups that miss in the instruction cache " \
-     "(L1I). Counts at 64-byte cache-line granularity.") \
+  _ (0x79, 0x04, 0, 0, 0, 0x00, IDQ, MITE_UOPS, \
+     "Counts the number of uops delivered to Instruction Decode Queue (IDQ) " \
+     "from the MITE path.") \
+  _ (0x79, 0x08, 0, 0, 0, 0x00, IDQ, DSB_UOPS, \
+     "Counts the number of uops delivered to Instruction Decode Queue (IDQ) " \
+     "from the Decode Stream Buffer (DSB) path.") \
+  _ (0x79, 0x30, 0, 0, 0, 0x00, IDQ, MS_UOPS, \
+     "Counts the number of uops delivered to Instruction Decode Queue (IDQ) " \
+     "from the Microcode Sequencer (MS) path.") \
+  _ (0x79, 0x30, 1, 0, 0, 0x01, IDQ, MS_SWITCHES, \
+     "Number of switches from DSB or MITE to the MS") \
+  _ ( \
+    0x80, 0x04, 0, 0, 0, 0x00, ICACHE_16B, IFDATA_STALL, \
+    "Cycles where a code fetch is stalled due to L1 instruction cache miss.") \
+  _ (0x83, 0x04, 0, 0, 0, 0x00, ICACHE_64B, IFTAG_STALL, \
+     "Cycles where a code fetch is stalled due to L1 instruction cache tag " \
+     "miss.") \
   _ (0x9C, 0x01, 0, 0, 0, 0x00, IDQ_UOPS_NOT_DELIVERED, CORE, \
      "Uops not delivered to Resource Allocation Table (RAT) per thread when " \
      "backend of the machine is not stalled") \
@@ -134,9 +154,8 @@
      "full. This counts cycles that the pipeline back-end blocked uop " \
      "delivery" \
      "from the front-end.") \
-  _ (0xA3, 0x04, 0, 0, 0, 0x04, CYCLE_ACTIVITY, CYCLES_NO_EXECUTE, \
-     "This event counts cycles during which no instructions were executed in" \
-     " the execution stage of the pipeline.") \
+  _ (0xA3, 0x04, 0, 0, 0, 0x04, CYCLE_ACTIVITY, STALLS_TOTAL, \
+     "Total execution stalls.") \
   _ (0xA3, 0x05, 0, 0, 0, 0x05, CYCLE_ACTIVITY, STALLS_L2_MISS, \
      "Execution stalls while L2 cache miss demand load is outstanding") \
   _ (0xA3, 0x06, 0, 0, 0, 0x06, CYCLE_ACTIVITY, STALLS_L3_MISS, \
@@ -145,6 +164,17 @@
      "Execution stalls while L1 cache miss demand load is outstanding") \
   _ (0xA3, 0x14, 0, 0, 0, 0x14, CYCLE_ACTIVITY, STALLS_MEM_ANY, \
      "Execution stalls while memory subsystem has an outstanding load.") \
+  _ (0xA6, 0x40, 0, 0, 0, 0x02, EXE_ACTIVITY, BOUND_ON_STORES, \
+     "Cycles where the Store Buffer was full and no loads caused an " \
+     "execution stall.") \
+  _ (0xA8, 0x01, 0, 0, 0, 0x00, LSD, UOPS, \
+     "Counts the number of uops delivered to the back-end by the LSD " \
+     "(Loop Stream Detector)") \
+  _ (0xAB, 0x02, 0, 0, 0, 0x00, DSB2MITE_SWITCHES, PENALTY_CYCLES, \
+     "This event counts fetch penalty cycles when a transition occurs from " \
+     "DSB to MITE.") \
+  _ (0xB1, 0x01, 0, 0, 0, 0x00, UOPS_EXECUTED, THREAD, \
+     "Counts the number of uops to be executed per-thread each cycle.") \
   _ (0xC0, 0x00, 0, 0, 0, 0x00, INST_RETIRED, ANY_P, \
      "Number of instructions retired. General Counter - architectural event") \
   _ (0xC2, 0x02, 0, 0, 0, 0x00, UOPS_RETIRED, RETIRE_SLOTS, \
@@ -155,8 +185,6 @@
      "All mispredicted macro branch instructions retired.") \
   _ (0xC4, 0x20, 0, 0, 0, 0x00, BR_INST_RETIRED, NEAR_TAKEN, \
      "Taken branch instructions retired.") \
-  _ (0xD0, 0x81, 0, 0, 0, 0x00, MEM_INST_RETIRED, ALL_LOADS, \
-     "All retired load instructions.") \
   _ (0xD0, 0x82, 0, 0, 0, 0x00, MEM_INST_RETIRED, ALL_STORES, \
      "All retired store instructions.") \
   _ (0xD1, 0x01, 0, 0, 0, 0x00, MEM_LOAD_RETIRED, L1_HIT, \
@@ -198,6 +226,10 @@
   _ (0xD3, 0x08, 0, 0, 0, 0x00, MEM_LOAD_L3_MISS_RETIRED, REMOTE_FWD, \
      "Retired load instructions whose data sources was forwarded from a " \
      "remote cache") \
+  _ (0xE6, 0x01, 0, 0, 0, 0x00, BACLEARS, ANY, \
+     "Counts the total number when the front end is resteered, mainly when " \
+     "the BPU cannot provide a correct prediction and this is corrected by " \
+     "other branch handling mechanisms at the front end.") \
   _ (0xF0, 0x40, 0, 0, 0, 0x00, L2_TRANS, L2_WB, \
      "L2 writebacks that access L2 cache") \
   _ (0xF1, 0x1F, 0, 0, 0, 0x00, L2_LINES_IN, ALL, \
-- 
cgit 1.2.3-korg