diff options
author | Viliam Luc <vluc@cisco.com> | 2022-04-13 14:00:44 +0200 |
---|---|---|
committer | Viliam Luc <vluc@cisco.com> | 2022-09-28 10:28:03 +0000 |
commit | a2182abd2665aa9264464a99ad77718e2c7bbe18 (patch) | |
tree | b6552e130c503c0694167ca7485711e776fa2b79 /resources | |
parent | bff439b69ee71b654b1da92564ff62de7327fe71 (diff) |
telemetry: linux telemetry with perf-stat
Signed-off-by: Viliam Luc <vluc@cisco.com>
Change-Id: I17ced17a309cc0ac21c5fc94e570c89a456339e2
Diffstat (limited to 'resources')
-rw-r--r-- | resources/libraries/python/TelemetryUtil.py | 6 | ||||
-rw-r--r-- | resources/libraries/robot/performance/performance_actions.robot | 72 | ||||
-rw-r--r-- | resources/templates/telemetry/bpf_runtime.yaml | 673 | ||||
-rw-r--r-- | resources/templates/telemetry/perf_stat_runtime.yaml | 142 | ||||
-rwxr-xr-x | resources/tools/telemetry/__main__.py | 4 | ||||
-rw-r--r-- | resources/tools/telemetry/bundle_bpf.py | 19 | ||||
-rw-r--r-- | resources/tools/telemetry/bundle_perf_stat.py | 109 | ||||
-rw-r--r-- | resources/tools/telemetry/constants.py | 5 | ||||
-rw-r--r-- | resources/tools/telemetry/metrics.py | 4 | ||||
-rw-r--r-- | resources/tools/telemetry/serializer.py | 2 |
10 files changed, 997 insertions, 39 deletions
diff --git a/resources/libraries/python/TelemetryUtil.py b/resources/libraries/python/TelemetryUtil.py index 2d4bb096c6..f8c7d8c9b5 100644 --- a/resources/libraries/python/TelemetryUtil.py +++ b/resources/libraries/python/TelemetryUtil.py @@ -14,8 +14,10 @@ """Telemetry utility.""" from robot.api import logger +from time import sleep from resources.libraries.python.Constants import Constants +from resources.libraries.python.VppCounters import VppCounters from resources.libraries.python.OptionString import OptionString from resources.libraries.python.ssh import exec_cmd, exec_cmd_no_error from resources.libraries.python.topology import NodeType @@ -119,6 +121,10 @@ class TelemetryUtil: f"{stdout}" ) + VppCounters.vpp_clear_runtime(node) + sleep(1) + VppCounters.vpp_show_runtime(node) + @staticmethod def run_telemetry_on_all_duts(nodes, profile): """Get telemetry stat read on all DUTs. diff --git a/resources/libraries/robot/performance/performance_actions.robot b/resources/libraries/robot/performance/performance_actions.robot index 3235dfa868..40f0bc9999 100644 --- a/resources/libraries/robot/performance/performance_actions.robot +++ b/resources/libraries/robot/performance/performance_actions.robot @@ -97,6 +97,78 @@ | | ... | ${nodes} | profile=vppctl_runtime.yaml | | Stop traffic on tg +| Additional Statistics Action For bpf-runtime +| | [Documentation] +| | ... | Additional Statistics Action for linux bundle counters with +| | ... | running traffic. +| | +| | ... | See documentation of the called keyword for required test variables. 
+| | +| | ${ppta} = | Get Packets Per Transaction Aggregated +| | ${ramp_up_duration} = | Get Ramp Up Duration +| | ${ramp_up_rate} = | Get Ramp Up Rate +| | ${runtime_duration} = | Get Runtime Duration +| | ${runtime_rate} = | Get Runtime Rate +| | ${traffic_directions} = | Get Traffic Directions +| | ${transaction_duration} = | Get Transaction Duration +| | ${transaction_scale} = | Get Transaction Scale +| | ${transaction_type} = | Get Transaction Type +| | ${use_latency} = | Get Use Latency +| | Send traffic on tg +| | ... | duration=${-1} +| | ... | rate=${runtime_rate} +| | ... | frame_size=${frame_size} +| | ... | traffic_profile=${traffic_profile} +| | ... | async_call=${True} +| | ... | ppta=${ppta} +| | ... | use_latency=${use_latency} +| | ... | traffic_directions=${traffic_directions} +| | ... | transaction_duration=${transaction_duration} +| | ... | transaction_scale=${transaction_scale} +| | ... | transaction_type=${transaction_type} +| | ... | duration_limit=${0.0} +| | ... | ramp_up_duration=${ramp_up_duration} +| | ... | ramp_up_rate=${ramp_up_rate} +| | Run Telemetry On All DUTs +| | ... | ${nodes} | profile=bpf_runtime.yaml +| | Stop traffic on tg + +| Additional Statistics Action For perf-stat-runtime +| | [Documentation] +| | ... | Additional Statistics Action for linux bundle counters with +| | ... | running traffic. +| | +| | ... | See documentation of the called keyword for required test variables. +| | +| | ${ppta} = | Get Packets Per Transaction Aggregated +| | ${ramp_up_duration} = | Get Ramp Up Duration +| | ${ramp_up_rate} = | Get Ramp Up Rate +| | ${runtime_duration} = | Get Runtime Duration +| | ${runtime_rate} = | Get Runtime Rate +| | ${traffic_directions} = | Get Traffic Directions +| | ${transaction_duration} = | Get Transaction Duration +| | ${transaction_scale} = | Get Transaction Scale +| | ${transaction_type} = | Get Transaction Type +| | ${use_latency} = | Get Use Latency +| | Send traffic on tg +| | ... 
| duration=${-1} +| | ... | rate=${runtime_rate} +| | ... | frame_size=${frame_size} +| | ... | traffic_profile=${traffic_profile} +| | ... | async_call=${True} +| | ... | ppta=${ppta} +| | ... | use_latency=${use_latency} +| | ... | traffic_directions=${traffic_directions} +| | ... | transaction_duration=${transaction_duration} +| | ... | transaction_scale=${transaction_scale} +| | ... | transaction_type=${transaction_type} +| | ... | duration_limit=${0.0} +| | ... | ramp_up_duration=${ramp_up_duration} +| | ... | ramp_up_rate=${ramp_up_rate} +| | Run Telemetry On All DUTs +| | ... | ${nodes} | profile=perf_stat_runtime.yaml +| | Stop traffic on tg + | Additional Statistics Action For vpp-runtime-iperf3 | | [Documentation] | | ... | Additional Statistics Action for clear and show runtime counters with diff --git a/resources/templates/telemetry/bpf_runtime.yaml b/resources/templates/telemetry/bpf_runtime.yaml index bb9d1c70ae..e2e1fd52f1 100644 --- a/resources/templates/telemetry/bpf_runtime.yaml +++ b/resources/templates/telemetry/bpf_runtime.yaml @@ -35,6 +35,7 @@ logging: handlers: [console_stdout, console_stderr] scheduler: duration: 1 + sample_period: 100 programs: - name: bundle_bpf metrics: @@ -46,6 +47,41 @@ programs: - name - cpu - pid + events: + - type: 0x4 # RAW + name: 0x3C # INTEL_CORE_E_CPU_CLK_UNHALTED_THREAD_P + target: on_cpu_cycle + table: cpu_cycle + code: | + #include <linux/ptrace.h> + #include <uapi/linux/bpf_perf_event.h> + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(cpu_cycle, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_cpu_cycle(struct bpf_perf_event_data *ctx) { + struct key_t key = {}; + get_key(&key); + + cpu_cycle.increment(key, ctx->sample_period); + return 0; 
+ } + - name: bundle_bpf + metrics: + counter: - name: cpu_instruction documentation: Instructions retired by CPUs namespace: bpf @@ -53,15 +89,85 @@ programs: - name - cpu - pid - - name: llc_reference - documentation: Last level cache operations by type + events: + - type: 0x4 # RAW + name: 0xC0 # INTEL_CORE_E_INST_RETIRED_ANY_P + target: on_cpu_instruction + table: cpu_instruction + code: | + #include <linux/ptrace.h> + #include <uapi/linux/bpf_perf_event.h> + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(cpu_instruction, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_cpu_instruction(struct bpf_perf_event_data *ctx) { + struct key_t key = {}; + get_key(&key); + + cpu_instruction.increment(key, ctx->sample_period); + return 0; + } + - name: bundle_bpf + metrics: + counter: + - name: cache_references + documentation: Cache references namespace: bpf labelnames: - name - cpu - pid - - name: llc_miss - documentation: Last level cache operations by type + events: + - type: 0x0 # HARDWARE + name: 0x2 # PERF_COUNT_HW_CACHE_REFERENCES + target: on_cache_reference + table: cache_references + code: | + #include <linux/ptrace.h> + #include <uapi/linux/bpf_perf_event.h> + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(cache_references, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_cache_reference(struct bpf_perf_event_data *ctx) { + struct key_t key = {}; + get_key(&key); + + cache_references.increment(key, ctx->sample_period); + return 
0; + } + - name: bundle_bpf + metrics: + counter: + - name: cache_miss + documentation: Cache misses namespace: bpf labelnames: - name @@ -69,21 +175,9 @@ programs: - pid events: - type: 0x0 # HARDWARE - name: 0x0 # PERF_COUNT_HW_CPU_CYCLES - target: on_cpu_cycle - table: cpu_cycle - - type: 0x0 # HARDWARE - name: 0x1 # PERF_COUNT_HW_INSTRUCTIONS - target: on_cpu_instruction - table: cpu_instruction - - type: 0x0 # HARDWARE - name: 0x2 # PERF_COUNT_HW_CACHE_REFERENCES - target: on_cache_reference - table: llc_reference - - type: 0x0 # HARDWARE name: 0x3 # PERF_COUNT_HW_CACHE_MISSES target: on_cache_miss - table: llc_miss + table: cache_miss code: | #include <linux/ptrace.h> #include <uapi/linux/bpf_perf_event.h> @@ -96,10 +190,7 @@ programs: char name[TASK_COMM_LEN]; }; - BPF_HASH(llc_miss, struct key_t); - BPF_HASH(llc_reference, struct key_t); - BPF_HASH(cpu_instruction, struct key_t); - BPF_HASH(cpu_cycle, struct key_t); + BPF_HASH(cache_miss, struct key_t); static inline __attribute__((always_inline)) void get_key(struct key_t* key) { key->cpu = bpf_get_smp_processor_id(); @@ -107,31 +198,555 @@ programs: bpf_get_current_comm(&(key->name), sizeof(key->name)); } - int on_cpu_cycle(struct bpf_perf_event_data *ctx) { + int on_cache_miss(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - cpu_cycle.increment(key, ctx->sample_period); + cache_miss.increment(key, ctx->sample_period); return 0; } - int on_cpu_instruction(struct bpf_perf_event_data *ctx) { +# - name: bundle_bpf +# metrics: +# counter: +# - name: branch_instruction +# documentation: Instructions retired by branch +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x4 # PERF_COUNT_HW_BRANCH_INSTRUCTION +# target: on_branch_instruction +# table: branch_instruction +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char 
name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(branch_instruction, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_branch_instruction(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# branch_instruction.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: branch_misses (not supported by CPU) +# documentation: Last level miss operations by type +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x5 # PERF_COUNT_HW_BRANCH_MISSES +# target: on_branch_misses +# table: branch_misses +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(branch_misses, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_branch_misses(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# branch_misses.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: bus_cycles +# documentation: Count of bus cycles +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x6 # PERF_COUNT_HW_BUS_CYCLES +# target: on_bus_cycles +# table: bus_cycles +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# 
BPF_HASH(bus_cycles, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# int on_bus_cycles(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# bus_cycles.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: stalled_cycles_frontend (not supported by CPU) +# documentation: Frontend stalled cycles +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x7 # PERF_COUNT_HW_STALLED_CYCLES_FRONTEND +# target: on_stalled_cycles_frontend +# table: stalled_cycles_frontend +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(stalled_cycles_frontend, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_stalled_cycles_frontend(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# stalled_cycles_frontend.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: stalled_cycles_backend +# documentation: Backend stalled cycles +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x8 # PERF_COUNT_HW_STALLED_CYCLES_BACKEND +# target: on_stalled_cycles_backend +# table: stalled_cycles_backend +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char 
name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(stalled_cycles_backend, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_stalled_cycles_backend(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# stalled_cycles_backend.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: referenced_cpu_cycles +# documentation: Referenced CPU cycles +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x9 # PERF_COUNT_HW_REF_CPU_CYCLES +# target: on_referenced_cpu_cycles +# table: referenced_cpu_cycles +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(referenced_cpu_cycles, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_referenced_cpu_cycles(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# referenced_cpu_cycles.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_cpu_clock +# documentation: SW CPU clock +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x0 # PERF_COUNT_SW_CPU_CLOCK +# target: on_sw_cpu_clock +# table: sw_cpu_clock +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# 
+# BPF_HASH(sw_cpu_clock, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_cpu_clock(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_cpu_clock.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_task_clock +# documentation: SW task clock +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x1 # PERF_COUNT_SW_TASK_CLOCK +# target: on_sw_task_clock +# table: sw_task_clock +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(sw_task_clock, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_task_clock(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_task_clock.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_page_faults +# documentation: SW page faults +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x2 # PERF_COUNT_SW_PAGE_FAULTS +# target: on_sw_page_faults +# table: sw_page_faults +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(sw_page_faults, struct key_t); +# +# static inline __attribute__((always_inline)) void 
get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_page_faults(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_page_faults.increment(key, ctx->sample_period); +# return 0; +# } + - name: bundle_bpf + metrics: + counter: + - name: sw_context_switches + documentation: SW context switches + namespace: bpf + labelnames: + - name + - cpu + - pid + events: + - type: 0x1 # SOFTWARE + name: 0x3 # PERF_COUNT_SW_CONTEXT_SWITCHES + target: on_sw_context_switches + table: sw_context_switches + code: | + #include <linux/ptrace.h> + #include <uapi/linux/bpf_perf_event.h> + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(sw_context_switches, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_sw_context_switches(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - cpu_instruction.increment(key, ctx->sample_period); + sw_context_switches.increment(key, ctx->sample_period); return 0; } - int on_cache_reference(struct bpf_perf_event_data *ctx) { +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_cpu_migrations +# documentation: SW cpu migrations +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x4 # PERF_COUNT_SW_CPU_MIGRATIONS +# target: on_sw_cpu_migrations +# table: sw_cpu_migrations +# code: | +# #include <linux/ptrace.h> +# #include <uapi/linux/bpf_perf_event.h> +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(sw_cpu_migrations, struct key_t); +# +# static inline 
__attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_cpu_migrations(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_cpu_migrations.increment(key, ctx->sample_period); +# return 0; +# } + - name: bundle_bpf + metrics: + counter: + - name: sw_page_faults_min + documentation: SW page faults minor + namespace: bpf + labelnames: + - name + - cpu + - pid + events: + - type: 0x1 # SOFTWARE + name: 0x5 # PERF_COUNT_SW_PAGE_FAULTS_MIN + target: on_sw_page_faults_min + table: sw_page_faults_min + code: | + #include <linux/ptrace.h> + #include <uapi/linux/bpf_perf_event.h> + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(sw_page_faults_min, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_sw_page_faults_min(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - llc_reference.increment(key, ctx->sample_period); + sw_page_faults_min.increment(key, ctx->sample_period); return 0; } - int on_cache_miss(struct bpf_perf_event_data *ctx) { + - name: bundle_bpf + metrics: + counter: + - name: sw_page_faults_maj + documentation: SW page faults major + namespace: bpf + labelnames: + - name + - cpu + - pid + events: + - type: 0x1 # SOFTWARE + name: 0x6 # PERF_COUNT_SW_PAGE_FAULTS_MAJ + target: on_sw_page_faults_maj + table: sw_page_faults_maj + code: | + #include <linux/ptrace.h> + #include <uapi/linux/bpf_perf_event.h> + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(sw_page_faults_maj, struct key_t); + + static inline 
__attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_sw_page_faults_maj(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - llc_miss.increment(key, ctx->sample_period); + sw_page_faults_maj.increment(key, ctx->sample_period); return 0; } diff --git a/resources/templates/telemetry/perf_stat_runtime.yaml b/resources/templates/telemetry/perf_stat_runtime.yaml new file mode 100644 index 0000000000..54b77a9bcc --- /dev/null +++ b/resources/templates/telemetry/perf_stat_runtime.yaml @@ -0,0 +1,142 @@ +--- +logging: + version: 1 + formatters: + console_stdout: + format: '%(asctime)s - %(name)s - %(message)s' + console_stderr: + format: '%(message)s' + prom: + format: '%(message)s' + handlers: + console_stdout: + class: logging.StreamHandler + level: INFO + formatter: console_stdout + stream: ext://sys.stdout + console_stderr: + class: logging.StreamHandler + level: ERROR + formatter: console_stderr + stream: ext://sys.stderr + prom: + class: logging.handlers.RotatingFileHandler + level: INFO + formatter: prom + filename: /tmp/metric.prom + mode: w + loggers: + prom: + handlers: [prom] + level: INFO + propagate: False + root: + level: INFO + handlers: [console_stdout, console_stderr] +scheduler: + duration: 1 +programs: + - name: bundle_perf_stat + metrics: + gauge: + - name: cpu-cycles + documentation: Cycles processed by CPUs + labelnames: + - name + - thread + - pid + events: + - name: cpu-cycles # 0x3C umask: 0x00 + EventCode: 0x3C + UMask: 0x00 + - name: bundle_perf_stat + metrics: + gauge: + - name: instructions + documentation: Instructions retired by CPUs + labelnames: + - name + - thread + - pid + events: + - name: instructions # 0xC0 umask: 0x00 + EventCode: 0xC0 + UMask: 0x00 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L1_HIT + documentation: 
L1 Hit + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L1_HIT # 0xD1 umask: 0x01 + EventCode: 0xD1 + UMask: 0x01 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L2_HIT + documentation: L2 Hit + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L2_HIT # 0xd1 umask: 0x02 + EventCode: 0xD1 + UMask: 0x02 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L3_HIT + documentation: L3 Hit + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L3_HIT # 0xd1 umask: 0x04 + EventCode: 0xD1 + UMask: 0x04 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L1_MISS + documentation: L1 Miss + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L1_MISS # 0xd1 umask: 0x08 + EventCode: 0xD1 + UMask: 0x08 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L2_MISS + documentation: L2 Miss + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L2_MISS # 0xd1 umask: 0x10 + EventCode: 0xD1 + UMask: 0x10 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L3_MISS + documentation: L3 Miss + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L3_MISS # 0xd1 umask: 0x020 + EventCode: 0xD1 + UMask: 0x20 diff --git a/resources/tools/telemetry/__main__.py b/resources/tools/telemetry/__main__.py index 2ab87b661a..7a612b8eea 100755 --- a/resources/tools/telemetry/__main__.py +++ b/resources/tools/telemetry/__main__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2021 Cisco and/or its affiliates. +# Copyright (c) 2022 Cisco and/or its affiliates. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at: @@ -19,6 +19,7 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter from .executor import Executor + def main(): """ Main entry function when called from cli @@ -45,5 +46,6 @@ def main(): else: Executor(args.config).execute(args.hook) + if __name__ == u"__main__": main() diff --git a/resources/tools/telemetry/bundle_bpf.py b/resources/tools/telemetry/bundle_bpf.py index c376da9d63..58cfd5d0b6 100644 --- a/resources/tools/telemetry/bundle_bpf.py +++ b/resources/tools/telemetry/bundle_bpf.py @@ -52,12 +52,15 @@ class BundleBpf: self.obj = BPF(text=self.code) - def attach(self, duration): + + def attach(self, sample_period): """ Attach events to BPF. - :param duration: Trial duration. - :type duration: int + :param sample_period: A "sampling" event is one that generates + an overflow notification every N events, where N is given by + sample_period. + :type sample_period: int """ try: for event in self.events: @@ -65,15 +68,16 @@ class BundleBpf: ev_type=event[u"type"], ev_config=event[u"name"], fn_name=event[u"target"], - sample_period=duration + sample_period=sample_period ) except AttributeError: - getLogger("console_stderr").error(u"Could not attach BPF events!") + getLogger("console_stderr").error(f"Could not attach BPF event: " + f"{event[u'name']}") sys.exit(Constants.err_linux_attach) def detach(self): """ - Dettach events from BPF. + Detach events from BPF. 
""" try: for event in self.events: @@ -98,6 +102,9 @@ class BundleBpf: for _, metric_list in self.metrics.items(): for metric in metric_list: + if table_name != metric[u"name"]: + table_name = metric[u"name"] + text += f"{table_name}\n" for (key, val) in self.obj.get_table(metric[u"name"]).items(): item = dict() labels = dict() diff --git a/resources/tools/telemetry/bundle_perf_stat.py b/resources/tools/telemetry/bundle_perf_stat.py new file mode 100644 index 0000000000..038e86e7a0 --- /dev/null +++ b/resources/tools/telemetry/bundle_perf_stat.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Perf Stat performance bundle.""" + +from logging import getLogger +import sys +import subprocess + +from .constants import Constants + + +class BundlePerfStat: + """ + Creates a Perf stat object. This is the main object for defining a Perf Stat + program and interacting with its output. + + Syntax: perf stat [-e <EVENT> | --event=EVENT] [-a] — <command> [<options>] + """ + def __init__(self, program, serializer, hook): + """Initialize Bundle Perf Stat event class. + + :param program: events + :param serializer: Metric serializer. + :param hook: Process ID. 
+ :type program: dict + :type serializer: Serializer + :type hook: int + """ + self.metrics = program[u"metrics"] + self.events = program[u"events"] + self.api_replies_list = list() + self.serializer = serializer + self.hook = hook + + def attach(self, duration=1): + """ + Performs perf stat. + + :param duration: Time how long perf stat is collecting data (in + seconds). Default value is 1 second. + :type duration: int + EventCode, UMask, EdgeDetect, AnyThread, Invert, CounterMask + """ + try: + self.serializer.create(metrics=self.metrics) + for event in self.events: + text = subprocess.getoutput( + f"""sudo perf stat -x\; -e\ + '{{cpu/event={hex(event[u"EventCode"])},\ + umask={hex(event[u"UMask"])}/u}}'\ + -a --per-thread\ + sleep {duration}""" + ) + + if text == u"": + getLogger("console_stdout").info(event[u"name"]) + continue + if u";" not in text: + getLogger("console_stdout").info( + f"Could not get counters for event \"{event[u'name']}\"" + f". Is it supported by CPU?" + ) + continue + + for line in text.splitlines(): + item = dict() + labels = dict() + item[u"name"] = event[u"name"] + item[u"value"] = line.split(";")[1] + labels["thread"] = u"-".join( + line.split(";")[0].split("-")[0:-1] + ) + labels["pid"] = line.split(";")[0].split("-")[-1] + labels["name"] = item[u"name"] + item[u"labels"] = labels + + getLogger("console_stdout").info(item) + self.api_replies_list.append(item) + + except AttributeError: + getLogger("console_stderr").error(f"Could not successfully run " + f"perf stat command.") + sys.exit(Constants.err_linux_perf_stat) + + def detach(self): + pass + + def fetch_data(self): + pass + + def process_data(self): + """ + Post process API replies. 
+ """ + for item in self.api_replies_list: + self.serializer.serialize( + metric=item[u"name"], labels=item[u"labels"], item=item + ) diff --git a/resources/tools/telemetry/constants.py b/resources/tools/telemetry/constants.py index 9961a07b8b..5363ddeaa4 100644 --- a/resources/tools/telemetry/constants.py +++ b/resources/tools/telemetry/constants.py @@ -17,6 +17,7 @@ does not need to be hard coded here, but can be read from environment variables. """ + class Constants: """Constants used in telemetry. 1-10: Telemetry errors @@ -46,3 +47,7 @@ class Constants: # Could not detach BPF events err_linux_detach = 52 + + # Could not successfully run perf stat command + err_linux_perf_stat = 53 + diff --git a/resources/tools/telemetry/metrics.py b/resources/tools/telemetry/metrics.py index 7a22acfd1b..ba6bae5e70 100644 --- a/resources/tools/telemetry/metrics.py +++ b/resources/tools/telemetry/metrics.py @@ -104,7 +104,7 @@ class Metric: u"Sample", [u"name", u"labels", u"value", u"timestamp"] ) - if not re.compile(r"^[a-zA-Z_:][a-zA-Z0-9_:]*$").match(name): + if not re.compile(r"^[a-zA-Z_:\-.][a-zA-Z0-9_:\-.]*$").match(name): raise ValueError(f"Invalid metric name: {name}!") if typ not in self.metric_types: raise ValueError(f"Invalid metric type: {typ}!") @@ -214,7 +214,7 @@ class MetricBase: full_name += f"{subsystem}_" if subsystem else u"" full_name += name - if not re.compile(r"^[a-zA-Z_:][a-zA-Z0-9_:]*$").match(full_name): + if not re.compile(r"^[a-zA-Z_:\-.][a-zA-Z0-9_:\-.]*$").match(full_name): raise ValueError( f"Invalid metric name: {full_name}!" ) diff --git a/resources/tools/telemetry/serializer.py b/resources/tools/telemetry/serializer.py index 3da857c0ab..e28454fc8b 100644 --- a/resources/tools/telemetry/serializer.py +++ b/resources/tools/telemetry/serializer.py @@ -19,7 +19,7 @@ from logging import getLogger class Serializer: """ - Executor class reponsible for executing configuration. + Executor class responsible for executing configuration. 
""" def __init__(self): """ |