From 3323e2018d6d736a25b15902bc85f559ea98adb5 Mon Sep 17 00:00:00 2001 From: Damjan Marion Date: Thu, 2 Dec 2021 11:28:57 +0100 Subject: vppinfra: add perf testing to test_vector_func Type: improvement Change-Id: I7aacd58d113c13036c15655817400032dd8d1932 Signed-off-by: Damjan Marion --- src/vppinfra/vector/test/array_mask.c | 2 +- src/vppinfra/vector/test/compress.c | 8 +- src/vppinfra/vector/test/count_equal.c | 4 +- src/vppinfra/vector/test/index_to_ptr.c | 2 +- src/vppinfra/vector/test/ip_csum.c | 65 +++++++- src/vppinfra/vector/test/mask_compare.c | 4 +- src/vppinfra/vector/test/memcpy_x86_64.c | 2 +- src/vppinfra/vector/test/test.c | 249 ++++++++++++++++++++++++++++++- src/vppinfra/vector/test/test.h | 73 ++++++++- 9 files changed, 386 insertions(+), 23 deletions(-) (limited to 'src/vppinfra/vector') diff --git a/src/vppinfra/vector/test/array_mask.c b/src/vppinfra/vector/test/array_mask.c index a1f4da728d4..904bb214f55 100644 --- a/src/vppinfra/vector/test/array_mask.c +++ b/src/vppinfra/vector/test/array_mask.c @@ -6,7 +6,7 @@ #include #include -__clib_test_fn void +__test_funct_fn void clib_array_mask_u32_wrapper (u32 *src, u32 mask, u32 n_elts) { clib_array_mask_u32 (src, mask, n_elts); diff --git a/src/vppinfra/vector/test/compress.c b/src/vppinfra/vector/test/compress.c index 9bc53ff1e41..4f3fd533340 100644 --- a/src/vppinfra/vector/test/compress.c +++ b/src/vppinfra/vector/test/compress.c @@ -6,25 +6,25 @@ #include #include -__clib_test_fn u32 +__test_funct_fn u32 clib_compress_u64_wrapper (u64 *dst, u64 *src, u64 *mask, u32 n_elts) { return clib_compress_u64 (dst, src, mask, n_elts); } -__clib_test_fn u32 +__test_funct_fn u32 clib_compress_u32_wrapper (u32 *dst, u32 *src, u64 *mask, u32 n_elts) { return clib_compress_u32 (dst, src, mask, n_elts); } -__clib_test_fn u32 +__test_funct_fn u32 clib_compress_u16_wrapper (u16 *dst, u16 *src, u64 *mask, u32 n_elts) { return clib_compress_u16 (dst, src, mask, n_elts); } -__clib_test_fn u32 +__test_funct_fn u32 
clib_compress_u8_wrapper (u8 *dst, u8 *src, u64 *mask, u32 n_elts) { return clib_compress_u8 (dst, src, mask, n_elts); diff --git a/src/vppinfra/vector/test/count_equal.c b/src/vppinfra/vector/test/count_equal.c index 1ca9735af4b..c57b0275984 100644 --- a/src/vppinfra/vector/test/count_equal.c +++ b/src/vppinfra/vector/test/count_equal.c @@ -9,8 +9,8 @@ #define foreach_clib_count_equal(type) \ typedef uword (wrapper_fn_##type) (type * a, uword maxcount); \ \ - __clib_test_fn uword clib_count_equal_##type##_wrapper (type *a, \ - uword maxcount) \ + __test_funct_fn uword clib_count_equal_##type##_wrapper (type *a, \ + uword maxcount) \ { \ return clib_count_equal_##type (a, maxcount); \ } \ diff --git a/src/vppinfra/vector/test/index_to_ptr.c b/src/vppinfra/vector/test/index_to_ptr.c index ae33020328a..7ee3b94cb7c 100644 --- a/src/vppinfra/vector/test/index_to_ptr.c +++ b/src/vppinfra/vector/test/index_to_ptr.c @@ -9,7 +9,7 @@ typedef void (wrapper_fn) (u32 *indices, void *base, u8 shift, void **ptrs, u32 n_elts); -__clib_test_fn void +__test_funct_fn void clib_index_to_ptr_u32_wrapper (u32 *indices, void *base, u8 shift, void **ptrs, u32 n_elts) { diff --git a/src/vppinfra/vector/test/ip_csum.c b/src/vppinfra/vector/test/ip_csum.c index 135d5ae63b2..cb33c036120 100644 --- a/src/vppinfra/vector/test/ip_csum.c +++ b/src/vppinfra/vector/test/ip_csum.c @@ -80,7 +80,7 @@ static clib_error_t * test_clib_ip_csum (clib_error_t *err) { u8 *buf; - buf = clib_mem_alloc_aligned (65536, CLIB_CACHE_LINE_BYTES); + buf = test_mem_alloc (65536); for (int i = 0; i < 65536; i++) buf[i] = 0xf0 + ((i * 7) & 0xf); @@ -110,11 +110,72 @@ test_clib_ip_csum (clib_error_t *err) } } done: - clib_mem_free (buf); + test_mem_free (buf); return err; } +void __test_perf_fn +perftest_ip4_hdr (int fd, test_perf_t *tp) +{ + u32 n = tp->n_ops; + u8 *data = test_mem_alloc_and_splat (20, n, (void *) &test1); + u16 *res = test_mem_alloc (n * sizeof (u16)); + + test_perf_event_enable (fd); + for (int i = 
0; i < n; i++) + res[i] = clib_ip_csum (data + i * 20, 20); + test_perf_event_disable (fd); + + test_mem_free (data); + test_mem_free (res); +} + +void __test_perf_fn +perftest_tcp_payload (int fd, test_perf_t *tp) +{ + u32 n = tp->n_ops; + volatile uword *lenp = &tp->arg0; + u8 *data = test_mem_alloc_and_fill_inc_u8 (n * lenp[0], 0, 0); /* n_ops payloads of arg0 bytes each, so every checksummed range stays inside the buffer */ + u16 *res = test_mem_alloc (n * sizeof (u16)); + + test_perf_event_enable (fd); + for (int i = 0; i < n; i++) + res[i] = clib_ip_csum (data + i * lenp[0], lenp[0]); + test_perf_event_disable (fd); + + test_mem_free (data); + test_mem_free (res); +} + +void __test_perf_fn +perftest_byte (int fd, test_perf_t *tp) +{ + volatile uword *np = &tp->n_ops; + u8 *data = test_mem_alloc_and_fill_inc_u8 (*np, 0, 0); + u16 *res = test_mem_alloc (sizeof (u16)); + + test_perf_event_enable (fd); + res[0] = clib_ip_csum (data, np[0]); + test_perf_event_disable (fd); + + test_mem_free (data); + test_mem_free (res); +} + REGISTER_TEST (clib_ip_csum) = { .name = "clib_ip_csum", .fn = test_clib_ip_csum, + .perf_tests = PERF_TESTS ( + { .name = "ip4_hdr", + .op_name = "IP4Hdr", + .n_ops = 1024, + .fn = perftest_ip4_hdr }, + { .name = "tcp_payload", + .op_name = "1460Byte", + .n_ops = 16, + .arg0 = 1460, + .fn = perftest_tcp_payload }, + { .name = "byte", .op_name = "Byte", .n_ops = 16384, .fn = perftest_byte } + + ), }; diff --git a/src/vppinfra/vector/test/mask_compare.c b/src/vppinfra/vector/test/mask_compare.c index 64df0ee084a..40cac1b6ce3 100644 --- a/src/vppinfra/vector/test/mask_compare.c +++ b/src/vppinfra/vector/test/mask_compare.c @@ -6,13 +6,13 @@ #include #include -__clib_test_fn void +__test_funct_fn void clib_mask_compare_u16_wrapper (u16 v, u16 *a, u64 *mask, u32 n_elts) { clib_mask_compare_u16 (v, a, mask, n_elts); } -__clib_test_fn void +__test_funct_fn void clib_mask_compare_u32_wrapper (u32 v, u32 *a, u64 *mask, u32 n_elts) { clib_mask_compare_u32 (v, a, mask, n_elts); diff --git a/src/vppinfra/vector/test/memcpy_x86_64.c 
b/src/vppinfra/vector/test/memcpy_x86_64.c index 78aab18d2e3..edb32d05dc6 100644 --- a/src/vppinfra/vector/test/memcpy_x86_64.c +++ b/src/vppinfra/vector/test/memcpy_x86_64.c @@ -8,7 +8,7 @@ #include #include -__clib_test_fn void +__test_funct_fn void wrapper (u8 *dst, u8 *src, uword n) { clib_memcpy_x86_64 (dst, src, n); diff --git a/src/vppinfra/vector/test/test.c b/src/vppinfra/vector/test/test.c index 1a8b9d6ea10..d098766481c 100644 --- a/src/vppinfra/vector/test/test.c +++ b/src/vppinfra/vector/test/test.c @@ -4,8 +4,9 @@ #include #include +#include -test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS] = {}; +test_main_t test_main; int test_march_supported (clib_march_variant_type_t type) @@ -18,14 +19,12 @@ test_march_supported (clib_march_variant_type_t type) return 0; } -int -main (int argc, char *argv[]) +clib_error_t * +test_funct (test_main_t *tm) { - clib_mem_init (0, 64ULL << 20); - for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++) { - test_registration_t *r = test_registrations[i]; + test_registration_t *r = tm->registrations[i]; if (r == 0 || test_march_supported (i) < 0) continue; @@ -51,3 +50,241 @@ main (int argc, char *argv[]) fformat (stdout, "\n"); return 0; } + +#define TEST_PERF_MAX_EVENTS 7 +typedef struct +{ + u64 config[TEST_PERF_MAX_EVENTS]; + u8 n_events; + format_function_t *format_fn; +} test_perf_event_bundle_t; + +static u8 * +format_test_perf_bundle_default (u8 *s, va_list *args) +{ + test_perf_event_bundle_t __clib_unused *b = + va_arg (*args, test_perf_event_bundle_t *); + test_perf_t *tp = va_arg (*args, test_perf_t *); + u64 *data = va_arg (*args, u64 *); + + if (data) + s = format (s, "%5.2f", (f64) data[1] / data[0]); + else + s = format (s, "%5s", "IPC"); + + if (data) + s = format (s, "%8.2f", (f64) data[0] / tp->n_ops); + else + s = format (s, "%8s", "Clks/Op"); + + if (data) + s = format (s, "%8.2f", (f64) data[1] / tp->n_ops); + else + s = format (s, "%8s", "Inst/Op"); + + if (data) + s = format (s, 
"%9.2f", (f64) data[2] / tp->n_ops); + else + s = format (s, "%9s", "Brnch/Op"); + + if (data) + s = format (s, "%10.2f", (f64) data[3] / tp->n_ops); + else + s = format (s, "%10s", "BrMiss/Op"); + return s; +} + +test_perf_event_bundle_t perf_bundles[] = { { + .config[0] = PERF_COUNT_HW_CPU_CYCLES, + .config[1] = PERF_COUNT_HW_INSTRUCTIONS, + .config[2] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + .config[3] = PERF_COUNT_HW_BRANCH_MISSES, + .n_events = 4, + .format_fn = format_test_perf_bundle_default, +} }; + +#ifdef __linux__ +clib_error_t * +test_perf (test_main_t *tm) +{ + clib_error_t *err = 0; + test_perf_event_bundle_t *b = perf_bundles; + int group_fd = -1, fds[TEST_PERF_MAX_EVENTS]; + u64 count[TEST_PERF_MAX_EVENTS + 3] = {}; + struct perf_event_attr pe = { + .size = sizeof (struct perf_event_attr), + .type = PERF_TYPE_HARDWARE, + .disabled = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .pinned = 1, + .exclusive = 1, + .read_format = (PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING), + }; + + for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++) + fds[i] = -1; + + for (int i = 0; i < b->n_events; i++) + { + pe.config = b->config[i]; + int fd = syscall (__NR_perf_event_open, &pe, /* pid */ 0, /* cpu */ -1, + /* group_fd */ group_fd, /* flags */ 0); + if (fd < 0) + { + err = clib_error_return_unix (0, "perf_event_open"); + goto done; + } + + if (group_fd == -1) + { + group_fd = fd; + pe.pinned = 0; + pe.exclusive = 0; + } + fds[i] = fd; + } + + for (int i = 0; i < CLIB_MARCH_TYPE_N_VARIANTS; i++) + { + test_registration_t *r = tm->registrations[i]; + + if (r == 0 || test_march_supported (i) < 0) + continue; + + fformat (stdout, "\nMultiarch Variant: %U\n", format_march_variant, i); + fformat (stdout, + "-------------------------------------------------------\n"); + while (r) + { + if (r->perf_tests) + { + test_perf_t *pt = r->perf_tests; + fformat (stdout, "%-22s%-12s%U\n", r->name, "OpType", + b->format_fn, b, pt, 0UL); + do + { 
+ u32 read_size = (b->n_events + 3) * sizeof (u64); + for (int i = 0; i < tm->repeat; i++) + { + test_perf_event_reset (group_fd); + pt->fn (group_fd, pt); + if ((read (group_fd, &count, read_size) != read_size)) + { + err = clib_error_return_unix (0, "read"); + goto done; + } + if (count[1] != count[2]) + clib_warning ( + "perf counters were not running all the time." +#ifdef __x86_64__ + "\nConsider turning NMI watchdog off ('sysctl -w " + "kernel.nmi_watchdog=0')." +#endif + ); + fformat (stdout, " %-20s%-12s%U\n", pt->name, + pt->op_name ? pt->op_name : "", b->format_fn, b, + pt, count + 3); + } + } + while ((++pt)->fn); + } + r = r->next; + } + } + +done: + for (int i = 0; i < TEST_PERF_MAX_EVENTS; i++) + if (fds[i] != -1) + close (fds[i]); + return err; +} +#endif + +int +main (int argc, char *argv[]) +{ + test_main_t *tm = &test_main; + unformat_input_t _i = {}, *i = &_i; + clib_mem_init (0, 64ULL << 20); + clib_error_t *err; + int perf = 0; + + /* defaults */ + tm->repeat = 3; + + unformat_init_command_line (i, argv); + + while (unformat_check_input (i) != UNFORMAT_END_OF_INPUT) + { + if (unformat (i, "perf")) + perf = 1; + else if (unformat (i, "repeat %d", &tm->repeat)) + ; + else + { + clib_warning ("unknown input '%U'", format_unformat_error, i); + exit (1); + } + } + + if (perf) + err = test_perf (tm); + else + err = test_funct (tm); + + if (err) + { + clib_error_report (err); + fformat (stderr, "\n"); + return 1; + } + return 0; +} + +void * +test_mem_alloc (uword size) +{ + void *rv; + size = round_pow2 (size, CLIB_CACHE_LINE_BYTES); + rv = clib_mem_alloc_aligned (size, CLIB_CACHE_LINE_BYTES); + clib_memset_u8 (rv, 0, size); + return rv; +} + +void * +test_mem_alloc_and_fill_inc_u8 (uword size, u8 start, u8 mask) +{ + u8 *rv; + mask = mask ? 
mask : 0xff; + size = round_pow2 (size, CLIB_CACHE_LINE_BYTES); + rv = clib_mem_alloc_aligned (size, CLIB_CACHE_LINE_BYTES); + for (uword i = 0; i < size; i++) + rv[i] = ((u8) i + start) & mask; + return rv; +} + +void * +test_mem_alloc_and_splat (uword elt_size, uword n_elts, void *elt) +{ + u8 *rv, *e; + uword data_size = elt_size * n_elts; + uword alloc_size = round_pow2 (data_size, CLIB_CACHE_LINE_BYTES); + e = rv = clib_mem_alloc_aligned (alloc_size, CLIB_CACHE_LINE_BYTES); + while (e - rv < data_size) + { + clib_memcpy_fast (e, elt, elt_size); + e += elt_size; + } + + if (data_size < alloc_size) + clib_memset_u8 (e, 0, alloc_size - data_size); + return rv; +} + +void +test_mem_free (void *p) +{ + clib_mem_free (p); +} diff --git a/src/vppinfra/vector/test/test.h b/src/vppinfra/vector/test/test.h index bc499fb24e8..02169c1d6de 100644 --- a/src/vppinfra/vector/test/test.h +++ b/src/vppinfra/vector/test/test.h @@ -6,20 +6,44 @@ #define included_test_test_h #include +#ifdef __linux__ +#include +#include +#endif typedef clib_error_t *(test_fn_t) (clib_error_t *); +struct test_perf_; +typedef void (test_perf_fn_t) (int fd, struct test_perf_ *tp); + +typedef struct test_perf_ +{ + u64 n_ops; + u64 arg0; + char *op_name; + char *name; + test_perf_fn_t *fn; +} test_perf_t; + typedef struct test_registration_ { char *name; u8 multiarch : 1; test_fn_t *fn; + test_perf_t *perf_tests; + u32 n_perf_tests; struct test_registration_ *next; } test_registration_t; -extern test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS]; +typedef struct +{ + test_registration_t *registrations[CLIB_MARCH_TYPE_N_VARIANTS]; + u32 repeat; +} test_main_t; +extern test_main_t test_main; -#define __clib_test_fn static __clib_noinline __clib_section (".test_wrapper") +#define __test_funct_fn static __clib_noinline __clib_section (".test_func") +#define __test_perf_fn static __clib_noinline __clib_section (".test_perf") #define REGISTER_TEST(x) \ test_registration_t CLIB_MARCH_SFX 
(__test_##x); \ @@ -27,9 +51,50 @@ extern test_registration_t *test_registrations[CLIB_MARCH_TYPE_N_VARIANTS]; void) \ { \ test_registration_t *r = &CLIB_MARCH_SFX (__test_##x); \ - r->next = test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)]; \ - test_registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)] = r; \ + r->next = \ + test_main.registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)]; \ + test_main.registrations[CLIB_MARCH_SFX (CLIB_MARCH_VARIANT_TYPE)] = r; \ } \ test_registration_t CLIB_MARCH_SFX (__test_##x) +#define PERF_TESTS(...) \ + (test_perf_t[]) \ + { \ + __VA_ARGS__, {} \ + } + +static_always_inline void +test_perf_event_ioctl (int fd, u32 req) +{ +#ifdef __x86_64__ + asm volatile inline("syscall" + : "=a"(req) /* rax: syscall number in, return value out (discarded) */ + : "D"(fd), "S"(req), "0"(__NR_ioctl), "d"(PERF_IOC_FLAG_GROUP) + : "rcx", "r11" /* registers modified by kernel */); +#else + ioctl (fd, req, PERF_IOC_FLAG_GROUP); +#endif +} + +static_always_inline void +test_perf_event_reset (int fd) +{ + test_perf_event_ioctl (fd, PERF_EVENT_IOC_RESET); +} +static_always_inline void +test_perf_event_enable (int fd) +{ + test_perf_event_ioctl (fd, PERF_EVENT_IOC_ENABLE); +} +static_always_inline void +test_perf_event_disable (int fd) +{ + test_perf_event_ioctl (fd, PERF_EVENT_IOC_DISABLE); +} + +void *test_mem_alloc (uword size); +void *test_mem_alloc_and_fill_inc_u8 (uword size, u8 start, u8 mask); +void *test_mem_alloc_and_splat (uword elt_size, uword n_elts, void *elt); +void test_mem_free (void *p); + #endif -- cgit 1.2.3-korg