/* * Copyright (c) 2020 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <vnet/vnet.h> #include <vlibapi/api.h> #include <vlibmemory/api.h> #include <vnet/plugin/plugin.h> #include <vpp/app/version.h> #include <linux/limits.h> #include <sys/ioctl.h> #include <perfmon/perfmon.h> perfmon_main_t perfmon_main; VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, .description = "Performance Monitor", }; VLIB_REGISTER_LOG_CLASS (if_default_log, static) = { .class_name = "perfmon", }; #define log_debug(fmt, ...) \ vlib_log_debug (if_default_log.class, fmt, __VA_ARGS__) #define log_warn(fmt, ...) \ vlib_log_warn (if_default_log.class, fmt, __VA_ARGS__) #define log_err(fmt, ...) vlib_log_err (if_default_log.class, fmt, __VA_ARGS__) void perfmon_reset (vlib_main_t *vm) { perfmon_main_t *pm = &perfmon_main; uword page_size = clib_mem_get_page_size (); if (pm->is_running) for (int i = 0; i < vlib_get_n_threads (); i++) vlib_node_set_dispatch_wrapper (vlib_get_main_by_index (i), 0); for (int i = 0; i < vec_len (pm->fds_to_close); i++) close (pm->fds_to_close[i]); vec_free (pm->fds_to_close); vec_free (pm->group_fds); if (pm->default_instance_type) { perfmon_instance_type_t *it = pm->default_instance_type; for (int i = 0; i < vec_len (it->instances); i++) vec_free (it->instances[i].name); vec_free (it->instances); vec_free (pm->default_instance_type); } for (int i = 0; i < vec_len (pm->thread_runtimes); i++) { perfmon_thread_runtime_t *tr = vec_elt_at_index (pm->thread_runtimes, i); vec_free (tr->node_stats); for (int j = 0; j < PERF_MAX_EVENTS; j++) if (tr->mmap_pages[j]) munmap (tr->mmap_pages[j], page_size); } vec_free (pm->thread_runtimes); pm->is_running = 0; pm->active_instance_type = 0; pm->active_bundle = 0; } static clib_error_t * perfmon_set (vlib_main_t *vm, perfmon_bundle_t *b) { clib_error_t *err = 0; perfmon_main_t *pm = &perfmon_main; perfmon_source_t *s; int is_node = 0; int n_nodes = vec_len (vm->node_main.nodes); uword page_size = clib_mem_get_page_size (); u32 instance_type = 0; perfmon_event_t *e; perfmon_instance_type_t *it = 0; perfmon_reset (vm); s = b->src; ASSERT (b->n_events); if (b->active_type == PERFMON_BUNDLE_TYPE_NODE) is_node = 1; if (s->instances_by_type == 0) { vec_add2 (pm->default_instance_type, it, 1); it->name = is_node ? "Thread/Node" : "Thread"; for (int i = 0; i < vlib_get_n_threads (); i++) { vlib_worker_thread_t *w = vlib_worker_threads + i; perfmon_instance_t *in; vec_add2 (it->instances, in, 1); in->cpu = w->cpu_id; in->pid = w->lwp; in->name = (char *) format (0, "%s (%u)%c", w->name, i, 0); } if (is_node) vec_validate (pm->thread_runtimes, vlib_get_n_threads () - 1); } else { e = s->events + b->events[0]; if (e->type_from_instance) { instance_type = e->instance_type; for (int i = 1; i < b->n_events; i++) { e = s->events + b->events[i]; ASSERT (e->type_from_instance == 1 && e->instance_type == instance_type); } } it = vec_elt_at_index (s->instances_by_type, instance_type); } pm->active_instance_type = it; for (int i = 0; i < vec_len (it->instances); i++) { perfmon_instance_t *in = vec_elt_at_index (it->instances, i); vec_validate (pm->group_fds, i); pm->group_fds[i] = -1; for (int j = 0; j < b->n_events; j++) { int fd; perfmon_event_t *e = s->events + b->events[j]; struct perf_event_attr pe = { .size = sizeof (struct perf_event_attr), .type = e->type_from_instance ? in->type : e->type, .config = e->config, .exclude_kernel = e->exclude_kernel, .read_format = (PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING), .disabled = 1, }; log_debug ("perf_event_open pe.type=%u pe.config=0x%x pid=%d " "cpu=%d group_fd=%d", pe.type, pe.config, in->pid, in->cpu, pm->group_fds[i]); fd = syscall (__NR_perf_event_open, &pe, in->pid, in->cpu, pm->group_fds[i], 0); if (fd == -1) { err = clib_error_return_unix (0, "perf_event_open"); goto error; } vec_add1 (pm->fds_to_close, fd); if (pm->group_fds[i] == -1) pm->group_fds[i] = fd; if (is_node) { perfmon_thread_runtime_t *tr; tr = vec_elt_at_index (pm->thread_runtimes, i); tr->mmap_pages[j] = mmap (0, page_size, PROT_READ, MAP_SHARED, fd, 0); if (tr->mmap_pages[j] == MAP_FAILED) { err = clib_error_return_unix (0, "mmap"); goto error; } } } if (is_node) { perfmon_thread_runtime_t *rt; rt = vec_elt_at_index (pm->thread_runtimes, i); rt->bundle = b; rt->n_events = b->n_events; rt->n_nodes = n_nodes; rt->preserve_samples = b->preserve_samples; vec_validate_aligned (rt->node_stats, n_nodes - 1, CLIB_CACHE_LINE_BYTES); } } pm->active_bundle = b; error: if (err) { log_err ("%U", format_clib_error, err); perfmon_reset (vm); } return err; } static_always_inline u32 perfmon_mmap_read_index (const struct perf_event_mmap_page *mmap_page) { u32 idx; u32 seq; /* See documentation in /usr/include/linux/perf_event.h, for more details * but the 2 main important things are: * 1) if seq != mmap_page->lock, it means the kernel is currently updating * the user page and we need to read it again * 2) if idx == 0, it means the perf event is currently turned off and we * just need to read the kernel-updated 'offset', otherwise we must also * add the current hw value (hence rdmpc) */ do { seq = mmap_page->lock; CLIB_COMPILER_BARRIER (); idx = mmap_page->index; CLIB_COMPILER_BARRIER (); } while (mmap_page->lock != seq); return idx; } clib_error_t * perfmon_start (vlib_main_t *vm, perfmon_bundle_t *b) { clib_error_t *err = 0; perfmon_main_t *pm = &perfmon_main; int n_groups; if (pm->is_running == 1) return clib_error_return (0, "already running"); if ((err = perfmon_set (vm, b)) != 0) return err; n_groups = vec_len (pm->group_fds); for (int i = 0; i < n_groups; i++) { if (ioctl (pm->group_fds[i], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { perfmon_reset (vm); return clib_error_return_unix (0, "ioctl(PERF_EVENT_IOC_ENABLE)"); } } if (b->active_type == PERFMON_BUNDLE_TYPE_NODE) { for (int i = 0; i < vec_len (pm->thread_runtimes); i++) { perfmon_thread_runtime_t *tr; tr = vec_elt_at_index (pm->thread_runtimes, i); for (int j = 0; j < b->n_events; j++) { tr->indexes[j] = perfmon_mmap_read_index (tr->mmap_pages[j]); /* if a zero index is returned generate error */ if (!tr->indexes[j]) { perfmon_reset (vm); return clib_error_return (0, "invalid rdpmc index"); } } } for (int i = 0; i < vlib_get_n_threads (); i++) vlib_node_set_dispatch_wrapper ( vlib_get_main_by_index (i), perfmon_dispatch_wrappers[b->n_events]); } pm->sample_time = vlib_time_now (vm); pm->is_running = 1; return 0; } clib_error_t * perfmon_stop (vlib_main_t *vm) { perfmon_main_t *pm = &perfmon_main; int n_groups = vec_len (pm->group_fds); if (pm->is_running != 1) return clib_error_return (0, "not running"); if (pm->active_bundle->active_type == PERFMON_BUNDLE_TYPE_NODE) { for (int i = 0; i < vlib_get_n_threads (); i++) vlib_node_set_dispatch_wrapper (vlib_get_main_by_index (i), 0); } for (int i = 0; i < n_groups; i++) { if (ioctl (pm->group_fds[i], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { perfmon_reset (vm); return clib_error_return_unix (0, "ioctl(PERF_EVENT_IOC_DISABLE)"); } } pm->is_running = 0; pm->sample_time = vlib_time_now (vm) - pm->sample_time; return 0; } static_always_inline u8 is_enough_counters (perfmon_bundle_t *b) { u8 bl[PERFMON_EVENT_TYPE_MAX]; u8 cpu[PERFMON_EVENT_TYPE_MAX]; clib_memset (&bl, 0, sizeof (bl)); clib_memset (&cpu, 0, sizeof (cpu)); /* how many does this uarch support */ if (!clib_get_pmu_counter_count (&cpu[PERFMON_EVENT_TYPE_FIXED], &cpu[PERFMON_EVENT_TYPE_GENERAL])) return 0; /* how many does the bundle require */ for (u16 i = 0; i < b->n_events; i++) { /* if source allows us to identify events, otherwise assume general */ if (b->src->get_event_type) bl[b->src->get_event_type (b->events[i])]++; else bl[PERFMON_EVENT_TYPE_GENERAL]++; } /* consciously ignoring pseudo events here */ return cpu[PERFMON_EVENT_TYPE_GENERAL] >= bl[PERFMON_EVENT_TYPE_GENERAL] && cpu[PERFMON_EVENT_TYPE_FIXED] >= bl[PERFMON_EVENT_TYPE_FIXED]; } static_always_inline u8 is_bundle_supported (perfmon_bundle_t *b) { perfmon_cpu_supports_t *supports = b->cpu_supports; if (!is_enough_counters (b)) return 0; if (!b->cpu_supports) return 1; for (int i = 0; i < b->n_cpu_supports; ++i) if (supports[i].cpu_supports ()) return 1; return 0; } static clib_error_t * perfmon_init (vlib_main_t *vm) { perfmon_main_t *pm = &perfmon_main; perfmon_source_t *s = pm->sources; perfmon_bundle_t *b = pm->bundles; pm->source_by_name = hash_create_string (0, sizeof (uword)); while (s) { clib_error_t *err; if (hash_get_mem (pm->source_by_name, s->name) != 0) clib_panic ("duplicate source name '%s'", s->name); if (s->init_fn && ((err = (s->init_fn) (vm, s)))) { log_warn ("skipping source '%s' - %U", s->name, format_clib_error, err); clib_error_free (err); s = s->next; continue; } hash_set_mem (pm->source_by_name, s->name, s); log_debug ("source '%s' regisrtered", s->name); s = s->next; } pm->bundle_by_name = hash_create_string (0, sizeof (uword)); while (b) { clib_error_t *err; uword *p; if ((p = hash_get_mem (pm->source_by_name, b->source)) == 0) { log_debug ("missing source '%s', skipping bundle '%s'", b->source, b->name); b = b->next; continue; } b->src = (perfmon_source_t *) p[0]; if (!is_bundle_supported (b)) { log_debug ("skipping bundle '%s' - not supported", b->name); b = b->next; continue; } if (b->init_fn && ((err = (b->init_fn) (vm, b)))) { log_warn ("skipping bundle '%s' - %U", b->name, format_clib_error, err); clib_error_free (err); b = b->next; continue; } if (hash_get_mem (pm->bundle_by_name, b->name) != 0) clib_panic ("duplicate bundle name '%s'", b->name); hash_set_mem (pm->bundle_by_name, b->name, b); log_debug ("bundle '%s' regisrtered", b->name); b = b->next; } return 0; } VLIB_INIT_FUNCTION (perfmon_init);