From a2182abd2665aa9264464a99ad77718e2c7bbe18 Mon Sep 17 00:00:00 2001 From: Viliam Luc Date: Wed, 13 Apr 2022 14:00:44 +0200 Subject: telemetry: linux telemetry with perf-stat Signed-off-by: Viliam Luc Change-Id: I17ced17a309cc0ac21c5fc94e570c89a456339e2 --- resources/templates/telemetry/bpf_runtime.yaml | 673 ++++++++++++++++++++- .../templates/telemetry/perf_stat_runtime.yaml | 142 +++++ 2 files changed, 786 insertions(+), 29 deletions(-) create mode 100644 resources/templates/telemetry/perf_stat_runtime.yaml (limited to 'resources/templates/telemetry') diff --git a/resources/templates/telemetry/bpf_runtime.yaml b/resources/templates/telemetry/bpf_runtime.yaml index bb9d1c70ae..e2e1fd52f1 100644 --- a/resources/templates/telemetry/bpf_runtime.yaml +++ b/resources/templates/telemetry/bpf_runtime.yaml @@ -35,6 +35,7 @@ logging: handlers: [console_stdout, console_stderr] scheduler: duration: 1 + sample_period: 100 programs: - name: bundle_bpf metrics: @@ -46,6 +47,41 @@ programs: - name - cpu - pid + events: + - type: 0x4 # RAW + name: 0x3C # INTEL_CORE_E_CPU_CLK_UNHALTED_THREAD_P + target: on_cpu_cycle + table: cpu_cycle + code: | + #include + #include + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(cpu_cycle, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_cpu_cycle(struct bpf_perf_event_data *ctx) { + struct key_t key = {}; + get_key(&key); + + cpu_cycle.increment(key, ctx->sample_period); + return 0; + } + - name: bundle_bpf + metrics: + counter: - name: cpu_instruction documentation: Instructions retired by CPUs namespace: bpf @@ -53,37 +89,95 @@ programs: - name - cpu - pid - - name: llc_reference - documentation: Last level cache operations by type + events: + - type: 0x4 # RAW + name: 0xC0 
# INTEL_CORE_E_INST_RETIRED_ANY_P + target: on_cpu_instruction + table: cpu_instruction + code: | + #include + #include + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(cpu_instruction, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_cpu_instruction(struct bpf_perf_event_data *ctx) { + struct key_t key = {}; + get_key(&key); + + cpu_instruction.increment(key, ctx->sample_period); + return 0; + } + - name: bundle_bpf + metrics: + counter: + - name: cache_references + documentation: Cache references namespace: bpf labelnames: - name - cpu - pid - - name: llc_miss - documentation: Last level cache operations by type + events: + - type: 0x0 # HARDWARE + name: 0x2 # PERF_COUNT_HW_CACHE_REFERENCES + target: on_cache_reference + table: cache_references + code: | + #include + #include + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(cache_references, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_cache_reference(struct bpf_perf_event_data *ctx) { + struct key_t key = {}; + get_key(&key); + + cache_references.increment(key, ctx->sample_period); + return 0; + } + - name: bundle_bpf + metrics: + counter: + - name: cache_miss + documentation: Cache misses namespace: bpf labelnames: - name - cpu - pid events: - - type: 0x0 # HARDWARE - name: 0x0 # PERF_COUNT_HW_CPU_CYCLES - target: on_cpu_cycle - table: cpu_cycle - - type: 0x0 # HARDWARE - name: 0x1 # PERF_COUNT_HW_INSTRUCTIONS - target: on_cpu_instruction - table: cpu_instruction - - type: 
0x0 # HARDWARE - name: 0x2 # PERF_COUNT_HW_CACHE_REFERENCES - target: on_cache_reference - table: llc_reference - type: 0x0 # HARDWARE name: 0x3 # PERF_COUNT_HW_CACHE_MISSES target: on_cache_miss - table: llc_miss + table: cache_miss code: | #include #include @@ -96,10 +190,7 @@ programs: char name[TASK_COMM_LEN]; }; - BPF_HASH(llc_miss, struct key_t); - BPF_HASH(llc_reference, struct key_t); - BPF_HASH(cpu_instruction, struct key_t); - BPF_HASH(cpu_cycle, struct key_t); + BPF_HASH(cache_miss, struct key_t); static inline __attribute__((always_inline)) void get_key(struct key_t* key) { key->cpu = bpf_get_smp_processor_id(); @@ -107,31 +198,555 @@ programs: bpf_get_current_comm(&(key->name), sizeof(key->name)); } - int on_cpu_cycle(struct bpf_perf_event_data *ctx) { + int on_cache_miss(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - cpu_cycle.increment(key, ctx->sample_period); + cache_miss.increment(key, ctx->sample_period); return 0; } - int on_cpu_instruction(struct bpf_perf_event_data *ctx) { +# - name: bundle_bpf +# metrics: +# counter: +# - name: branch_instruction +# documentation: Instructions retired by branch +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x4 # PERF_COUNT_HW_BRANCH_INSTRUCTION +# target: on_branch_instruction +# table: branch_instruction +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(branch_instruction, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_branch_instruction(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# branch_instruction.increment(key, ctx->sample_period); +# return 0; +# } +# - 
name: bundle_bpf +# metrics: +# counter: +# - name: branch_misses (not supported by CPU) +# documentation: Branch misses +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x5 # PERF_COUNT_HW_BRANCH_MISSES +# target: on_branch_misses +# table: branch_misses +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(branch_misses, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_branch_misses(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# branch_misses.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: bus_cycles +# documentation: Count of bus cycles +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x6 # PERF_COUNT_HW_BUS_CYCLES +# target: on_bus_cycles +# table: bus_cycles +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(bus_cycles, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# int on_bus_cycles(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# bus_cycles.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: stalled_cycles_frontend (not supported by CPU) +# documentation: Frontend stalled cycles +# namespace: bpf +# labelnames:
+# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x7 # PERF_COUNT_HW_STALLED_CYCLES_FRONTEND +# target: on_stalled_cycles_frontend +# table: stalled_cycles_frontend +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(stalled_cycles_frontend, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_stalled_cycles_frontend(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# stalled_cycles_frontend.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: stalled_cycles_backend +# documentation: Backend stalled cycles +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x0 # HARDWARE +# name: 0x8 # PERF_COUNT_HW_STALLED_CYCLES_BACKEND +# target: on_stalled_cycles_backend +# table: stalled_cycles_backend +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(stalled_cycles_backend, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_stalled_cycles_backend(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# stalled_cycles_backend.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: referenced_cpu_cycles +# documentation: Referenced CPU cycles +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 
0x0 # HARDWARE +# name: 0x9 # PERF_COUNT_HW_REF_CPU_CYCLES +# target: on_referenced_cpu_cycles +# table: referenced_cpu_cycles +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(referenced_cpu_cycles, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_referenced_cpu_cycles(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# referenced_cpu_cycles.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_cpu_clock +# documentation: SW CPU clock +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x0 # PERF_COUNT_SW_CPU_CLOCK +# target: on_sw_cpu_clock +# table: sw_cpu_clock +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(sw_cpu_clock, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_cpu_clock(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_cpu_clock.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_task_clock +# documentation: SW task clock +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x1 # PERF_COUNT_SW_TASK_CLOCK +# target: on_sw_task_clock +# table: sw_task_clock +# code: | +# #include +# #include +# +# const int max_cpus = 
256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(sw_task_clock, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_task_clock(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_task_clock.increment(key, ctx->sample_period); +# return 0; +# } +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_page_faults +# documentation: SW page faults +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x2 # PERF_COUNT_SW_PAGE_FAULTS +# target: on_sw_page_faults +# table: sw_page_faults +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(sw_page_faults, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_page_faults(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_page_faults.increment(key, ctx->sample_period); +# return 0; +# } + - name: bundle_bpf + metrics: + counter: + - name: sw_context_switches + documentation: SW context switches + namespace: bpf + labelnames: + - name + - cpu + - pid + events: + - type: 0x1 # SOFTWARE + name: 0x3 # PERF_COUNT_SW_CONTEXT_SWITCHES + target: on_sw_context_switches + table: sw_context_switches + code: | + #include + #include + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(sw_context_switches, struct key_t); + + static inline __attribute__((always_inline)) void 
get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_sw_context_switches(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - cpu_instruction.increment(key, ctx->sample_period); + sw_context_switches.increment(key, ctx->sample_period); return 0; } - int on_cache_reference(struct bpf_perf_event_data *ctx) { +# - name: bundle_bpf +# metrics: +# counter: +# - name: sw_cpu_migrations +# documentation: SW cpu migrations +# namespace: bpf +# labelnames: +# - name +# - cpu +# - pid +# events: +# - type: 0x1 # SOFTWARE +# name: 0x4 # PERF_COUNT_SW_CPU_MIGRATIONS +# target: on_sw_cpu_migrations +# table: sw_cpu_migrations +# code: | +# #include +# #include +# +# const int max_cpus = 256; +# +# struct key_t { +# int cpu; +# int pid; +# char name[TASK_COMM_LEN]; +# }; +# +# BPF_HASH(sw_cpu_migrations, struct key_t); +# +# static inline __attribute__((always_inline)) void get_key(struct key_t* key) { +# key->cpu = bpf_get_smp_processor_id(); +# key->pid = bpf_get_current_pid_tgid(); +# bpf_get_current_comm(&(key->name), sizeof(key->name)); +# } +# +# int on_sw_cpu_migrations(struct bpf_perf_event_data *ctx) { +# struct key_t key = {}; +# get_key(&key); +# +# sw_cpu_migrations.increment(key, ctx->sample_period); +# return 0; +# } + - name: bundle_bpf + metrics: + counter: + - name: sw_page_faults_min + documentation: SW page faults minor + namespace: bpf + labelnames: + - name + - cpu + - pid + events: + - type: 0x1 # SOFTWARE + name: 0x5 # PERF_COUNT_SW_PAGE_FAULTS_MIN + target: on_sw_page_faults_min + table: sw_page_faults_min + code: | + #include + #include + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(sw_page_faults_min, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = 
bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_sw_page_faults_min(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - llc_reference.increment(key, ctx->sample_period); + sw_page_faults_min.increment(key, ctx->sample_period); return 0; } - int on_cache_miss(struct bpf_perf_event_data *ctx) { + - name: bundle_bpf + metrics: + counter: + - name: sw_page_faults_maj + documentation: SW page faults major + namespace: bpf + labelnames: + - name + - cpu + - pid + events: + - type: 0x1 # SOFTWARE + name: 0x6 # PERF_COUNT_SW_PAGE_FAULTS_MAJ + target: on_sw_page_faults_maj + table: sw_page_faults_maj + code: | + #include + #include + + const int max_cpus = 256; + + struct key_t { + int cpu; + int pid; + char name[TASK_COMM_LEN]; + }; + + BPF_HASH(sw_page_faults_maj, struct key_t); + + static inline __attribute__((always_inline)) void get_key(struct key_t* key) { + key->cpu = bpf_get_smp_processor_id(); + key->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&(key->name), sizeof(key->name)); + } + + int on_sw_page_faults_maj(struct bpf_perf_event_data *ctx) { struct key_t key = {}; get_key(&key); - llc_miss.increment(key, ctx->sample_period); + sw_page_faults_maj.increment(key, ctx->sample_period); return 0; } diff --git a/resources/templates/telemetry/perf_stat_runtime.yaml b/resources/templates/telemetry/perf_stat_runtime.yaml new file mode 100644 index 0000000000..54b77a9bcc --- /dev/null +++ b/resources/templates/telemetry/perf_stat_runtime.yaml @@ -0,0 +1,142 @@ +--- +logging: + version: 1 + formatters: + console_stdout: + format: '%(asctime)s - %(name)s - %(message)s' + console_stderr: + format: '%(message)s' + prom: + format: '%(message)s' + handlers: + console_stdout: + class: logging.StreamHandler + level: INFO + formatter: console_stdout + stream: ext://sys.stdout + console_stderr: + class: logging.StreamHandler + level: ERROR + formatter: 
console_stderr + stream: ext://sys.stderr + prom: + class: logging.handlers.RotatingFileHandler + level: INFO + formatter: prom + filename: /tmp/metric.prom + mode: w + loggers: + prom: + handlers: [prom] + level: INFO + propagate: False + root: + level: INFO + handlers: [console_stdout, console_stderr] +scheduler: + duration: 1 +programs: + - name: bundle_perf_stat + metrics: + gauge: + - name: cpu-cycles + documentation: Cycles processed by CPUs + labelnames: + - name + - thread + - pid + events: + - name: cpu-cycles # 0x3C umask: 0x00 + EventCode: 0x3C + UMask: 0x00 + - name: bundle_perf_stat + metrics: + gauge: + - name: instructions + documentation: Instructions retired by CPUs + labelnames: + - name + - thread + - pid + events: + - name: instructions # 0xC0 umask: 0x00 + EventCode: 0xC0 + UMask: 0x00 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L1_HIT + documentation: L1 Hit + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L1_HIT # 0xD1 umask: 0x01 + EventCode: 0xD1 + UMask: 0x01 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L2_HIT + documentation: L2 Hit + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L2_HIT # 0xd1 umask: 0x02 + EventCode: 0xD1 + UMask: 0x02 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L3_HIT + documentation: L3 Hit + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L3_HIT # 0xd1 umask: 0x04 + EventCode: 0xD1 + UMask: 0x04 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L1_MISS + documentation: L1 Miss + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L1_MISS # 0xd1 umask: 0x08 + EventCode: 0xD1 + UMask: 0x08 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L2_MISS + documentation: L2 Miss + labelnames: + - name + - thread + - pid + events: + - 
name: MEM_LOAD_UOPS_RETIRED.L2_MISS # 0xd1 umask: 0x10 + EventCode: 0xD1 + UMask: 0x10 + - name: bundle_perf_stat + metrics: + gauge: + - name: MEM_LOAD_UOPS_RETIRED.L3_MISS + documentation: L3 Miss + labelnames: + - name + - thread + - pid + events: + - name: MEM_LOAD_UOPS_RETIRED.L3_MISS # 0xd1 umask: 0x20 + EventCode: 0xD1 + UMask: 0x20 -- cgit 1.2.3-korg