From 6de58f5fd0c9d47a65e24d3617e465b9fa8d8872 Mon Sep 17 00:00:00 2001 From: Nathan Skrzypczak Date: Mon, 24 Jan 2022 17:10:41 +0100 Subject: cnat: maglev fixes & improvements This fixes the maglev logic which previously included a wrong simplication. It moves the maglev logic to its own file, and adds a test function in the debug cli. Type: improvement Change-Id: I2790ae2a26fc1c5739ff02f41d436bfcafd5b380 Signed-off-by: Nathan Skrzypczak --- src/plugins/cnat/CMakeLists.txt | 1 + src/plugins/cnat/cnat_maglev.c | 375 ++++++++++++++++++++++++++++++++++++ src/plugins/cnat/cnat_maglev.h | 21 ++ src/plugins/cnat/cnat_translation.c | 104 +--------- src/plugins/cnat/cnat_types.h | 2 + src/vppinfra/format.h | 1 + src/vppinfra/std-formats.c | 46 +++++ 7 files changed, 447 insertions(+), 103 deletions(-) create mode 100644 src/plugins/cnat/cnat_maglev.c create mode 100644 src/plugins/cnat/cnat_maglev.h (limited to 'src') diff --git a/src/plugins/cnat/CMakeLists.txt b/src/plugins/cnat/CMakeLists.txt index cfb55661a78..e99bf056a35 100644 --- a/src/plugins/cnat/CMakeLists.txt +++ b/src/plugins/cnat/CMakeLists.txt @@ -24,6 +24,7 @@ add_vpp_plugin(cnat cnat_types.c cnat_snat_policy.c cnat_src_policy.c + cnat_maglev.c API_FILES cnat.api diff --git a/src/plugins/cnat/cnat_maglev.c b/src/plugins/cnat/cnat_maglev.c new file mode 100644 index 00000000000..669c6479619 --- /dev/null +++ b/src/plugins/cnat/cnat_maglev.c @@ -0,0 +1,375 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc. + */ + +#include + +static int +cnat_maglev_perm_compare (void *_a, void *_b) +{ + return *(u64 *) _b - *(u64 *) _a; +} + +/** + * Maglev algorithm implementation. This takes permutation as input, + * with the values of offset & skip for the backends. + * It fills buckets matching the permuntations, provided buckets is + * already of length at least M + */ +static void +cnat_maglev_shuffle (cnat_maglev_perm_t *permutation, u32 *buckets) +{ + u32 N, M, i, done = 0; + u32 *next = 0; + + N = vec_len (permutation); + if (N == 0) + return; + + M = vec_len (buckets); + vec_set (buckets, -1); + + vec_validate (next, N - 1); + vec_zero (next); + + while (1) + { + for (i = 0; i < N; i++) + { + u32 c = (permutation[i].offset + next[i] * permutation[i].skip) % M; + while (buckets[c] != (u32) -1) + { + next[i]++; + c = (permutation[i].offset + next[i] * permutation[i].skip) % M; + } + + buckets[c] = permutation[i].index; + next[i]++; + done++; + + if (done == M) + { + vec_free (next); + return; + } + } + } +} + +void +cnat_translation_init_maglev (cnat_translation_t *ct) +{ + cnat_maglev_perm_t *permutations = NULL; + cnat_main_t *cm = &cnat_main; + cnat_ep_trk_t *trk; + u32 backend_index = 0; + + if (vec_len (ct->ct_active_paths) == 0) + return; + + vec_foreach (trk, ct->ct_active_paths) + { + cnat_maglev_perm_t permutation; + u32 h1, h2; + + if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip)) + { + u32 a, b, c; + a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32; + b = (u64) trk->ct_ep[VLIB_TX].ce_port; + c = 0; + hash_v3_mix32 (a, b, c); + hash_v3_finalize32 (a, b, c); + h1 = c; + h2 = b; + } + else + { + u64 a, b, c; + a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0]; + b = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1]; + c = (u64) trk->ct_ep[VLIB_TX].ce_port; + hash_mix64 (a, b, c); + h1 = c; + h2 = b; + } + + permutation.offset = h1 % cm->maglev_len; + permutation.skip = h2 % (cm->maglev_len - 1) + 1; + permutation.index = backend_index++; + + if (trk->ct_flags & CNAT_TRK_FLAG_TEST_DISABLED) + continue; + + vec_add1 (permutations, permutation); + } + + vec_sort_with_function (permutations, cnat_maglev_perm_compare); + + vec_validate (ct->lb_maglev, cm->maglev_len - 1); + + cnat_maglev_shuffle (permutations, ct->lb_maglev); + + vec_free (permutations); +} + +static int +cnat_u32_vec_contains (u32 *v, u32 e) +{ + int i; + + vec_foreach_index (i, v) + if (v[i] == e) + return 1; + + return 0; +} + +static void +cnat_maglev_print_changes (vlib_main_t *vm, u32 *changed_bk_indices, + u32 *old_maglev_lb, u32 *new_maglev_lb) +{ + u32 good_flow_buckets = 0, reset_flow_buckets = 0, stable_to_reset = 0; + u32 reset_to_stable = 0, switched_stable = 0; + for (u32 i = 0; i < vec_len (new_maglev_lb); i++) + { + u8 is_new_changed = + cnat_u32_vec_contains (changed_bk_indices, new_maglev_lb[i]); + u8 is_old_changed = + cnat_u32_vec_contains (changed_bk_indices, old_maglev_lb[i]); + if (new_maglev_lb[i] == old_maglev_lb[i]) + { + if (is_new_changed) + reset_flow_buckets++; + else + good_flow_buckets++; + } + else + { + if (is_new_changed) + stable_to_reset++; + else if (is_old_changed) + reset_to_stable++; + else + switched_stable++; + } + } + vlib_cli_output (vm, + "good B->B:%d | lost A->A':%d A->B:%d ~%0.2f%% | bad " + "B->A':%d B->C:%d ~%0.2f%%", + good_flow_buckets, reset_flow_buckets, reset_to_stable, + (f64) (reset_flow_buckets + reset_to_stable) / + vec_len (new_maglev_lb) * 100.0, + stable_to_reset, switched_stable, + (f64) (stable_to_reset + switched_stable) / + vec_len (new_maglev_lb) * 100.0); +} + +static u8 * +format_cnat_maglev_buckets (u8 *s, va_list *args) +{ + u32 *buckets = va_arg (*args, u32 *); + u32 backend_idx = va_arg (*args, u32); + u32 count = va_arg (*args, u32); + + for (u32 ii = 0; ii < vec_len (buckets); ii++) + if (buckets[ii] == backend_idx) + { + s = format (s, "%d,", ii); + if (--count == 0) + return (s); + } + return (s); +} + +static clib_error_t * +cnat_translation_test_init_maglev (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + cnat_translation_t *trs = 0, *ct; + u64 num_backends = 0, n_tests = 0; + cnat_main_t *cm = &cnat_main; + cnat_ep_trk_t *trk; + u32 rnd; + u32 n_changes = 0, n_remove = 0, verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "tests %d", &n_tests)) + ; + else if (unformat (input, "backends %d", &num_backends)) + ; + else if (unformat (input, "len %d", &cm->maglev_len)) + ; + else if (unformat (input, "change %d", &n_changes)) + ; + else if (unformat (input, "rm %d", &n_remove)) + ; + else if (unformat (input, "verbose %d", &verbose)) + ; + else + return (clib_error_return (0, "unknown input '%U'", + format_unformat_error, input)); + } + + if (num_backends == 0 || n_tests == 0) + return (clib_error_return (0, "No backends / tests to run")); + ; + + vlib_cli_output (vm, "generating random backends..."); + rnd = random_default_seed (); + + vec_validate (trs, n_tests - 1); + vec_foreach (ct, trs) + { + vec_validate (ct->ct_active_paths, num_backends - 1); + vec_foreach (trk, ct->ct_active_paths) + { + trk->ct_flags = 0; + ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip) = AF_IP4; + ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32 = random_u32 (&rnd); + trk->ct_ep[VLIB_TX].ce_port = random_u32 (&rnd); + } + } + + vlib_cli_output (vm, "testing..."); + f64 start_time = vlib_time_now (vm); + vec_foreach (ct, trs) + cnat_translation_init_maglev (ct); + f64 d = vlib_time_now (vm) - start_time; + + vlib_cli_output (vm, "Test took : %U", format_duration, d); + vlib_cli_output (vm, "Per pool : %U", format_duration, d / n_tests); + + /* sanity checking of the output */ + u32 *backend_freqs = 0; + vec_validate (backend_freqs, num_backends - 1); + vec_foreach (ct, trs) + { + if (vec_len (ct->lb_maglev) != cm->maglev_len) + vlib_cli_output (vm, "Unexpected bucket length %d", + vec_len (ct->lb_maglev)); + + vec_zero (backend_freqs); + for (u32 i = 0; i < vec_len (ct->lb_maglev); i++) + { + if (ct->lb_maglev[i] >= num_backends) + clib_warning ("out of bound backend"); + backend_freqs[ct->lb_maglev[i]]++; + } + u32 fmin = ~0, fmax = 0; + for (u32 i = 0; i < num_backends; i++) + { + if (backend_freqs[i] > fmax) + fmax = backend_freqs[i]; + if (backend_freqs[i] < fmin) + fmin = backend_freqs[i]; + } + f64 fdiff = (fmax - fmin); + if (fdiff / vec_len (ct->lb_maglev) - 1 > 0.02) + vlib_cli_output (vm, "More than 2%% frequency diff (min %d max %d)", + fmin, fmax); + } + vec_free (backend_freqs); + + int i = 0; + if (verbose) + vec_foreach (ct, trs) + { + vlib_cli_output (vm, "Translation %d", i++); + for (u32 i = 0; i < verbose; i++) + { + u32 j = random_u32 (&rnd) % vec_len (ct->ct_active_paths); + trk = &ct->ct_active_paths[j]; + vlib_cli_output ( + vm, "[%03d] %U:%d buckets:%U", j, format_ip_address, + &trk->ct_ep[VLIB_TX].ce_ip, trk->ct_ep[VLIB_TX].ce_port, + format_cnat_maglev_buckets, ct->lb_maglev, j, verbose); + } + } + + if (n_remove != 0) + { + vlib_cli_output ( + vm, "Removing %d entries (refered to as A), others (B,C) stay same", + n_remove); + vec_foreach (ct, trs) + { + u32 *old_maglev_lb = 0; + u32 *changed_bk_indices = 0; + if (vec_len (ct->lb_maglev) != cm->maglev_len) + vlib_cli_output (vm, "Unexpected bucket length %d", + vec_len (ct->lb_maglev)); + + vec_validate (changed_bk_indices, n_remove - 1); + for (u32 i = 0; i < n_remove; i++) + { + /* remove n_remove backends from the LB set */ + changed_bk_indices[i] = + random_u32 (&rnd) % vec_len (ct->ct_active_paths); + trk = &ct->ct_active_paths[changed_bk_indices[i]]; + trk->ct_flags |= CNAT_TRK_FLAG_TEST_DISABLED; + } + + old_maglev_lb = vec_dup (ct->lb_maglev); + cnat_translation_init_maglev (ct); + + cnat_maglev_print_changes (vm, changed_bk_indices, old_maglev_lb, + ct->lb_maglev); + + vec_free (changed_bk_indices); + vec_free (old_maglev_lb); + } + } + + /* Reshuffle and check changes */ + if (n_changes != 0) + { + vlib_cli_output ( + vm, + "Changing %d entries (refered to as A->A'), others (B,C) stay same", + n_changes); + vec_foreach (ct, trs) + { + if (vec_len (ct->lb_maglev) != cm->maglev_len) + vlib_cli_output (vm, "Unexpected bucket length %d", + vec_len (ct->lb_maglev)); + + u32 *old_maglev_lb = 0; + u32 *changed_bk_indices = 0; + + vec_validate (changed_bk_indices, n_changes - 1); + for (u32 i = 0; i < n_changes; i++) + { + /* Change n_changes backends in the LB set */ + changed_bk_indices[i] = + random_u32 (&rnd) % vec_len (ct->ct_active_paths); + trk = &ct->ct_active_paths[changed_bk_indices[i]]; + ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32 = + random_u32 (&rnd); + trk->ct_ep[VLIB_TX].ce_port = random_u32 (&rnd) & 0xffff; + } + old_maglev_lb = vec_dup (ct->lb_maglev); + + cnat_translation_init_maglev (ct); + cnat_maglev_print_changes (vm, changed_bk_indices, old_maglev_lb, + ct->lb_maglev); + + vec_free (changed_bk_indices); + vec_free (old_maglev_lb); + } + } + + vec_foreach (ct, trs) + vec_free (ct->ct_active_paths); + vec_free (trs); + + return (NULL); +} + +VLIB_CLI_COMMAND (cnat_translation_test_init_maglev_cmd, static) = { + .path = "test cnat maglev", + .short_help = "test cnat maglev tests [n_tests] backends [num_backends] len " + "[maglev_len]", + .function = cnat_translation_test_init_maglev, +}; diff --git a/src/plugins/cnat/cnat_maglev.h b/src/plugins/cnat/cnat_maglev.h new file mode 100644 index 00000000000..a71dd3ce796 --- /dev/null +++ b/src/plugins/cnat/cnat_maglev.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc. + */ + +#ifndef __CNAT_MAGLEV_H__ +#define __CNAT_MAGLEV_H__ + +#include +#include + +typedef struct +{ + /* offset & skip used for sorting, should be first */ + u32 offset; + u32 skip; + u32 index; +} cnat_maglev_perm_t; + +extern void cnat_translation_init_maglev (cnat_translation_t *ct); + +#endif \ No newline at end of file diff --git a/src/plugins/cnat/cnat_translation.c b/src/plugins/cnat/cnat_translation.c index 748d531f3de..ee3dc780fea 100644 --- a/src/plugins/cnat/cnat_translation.c +++ b/src/plugins/cnat/cnat_translation.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -200,110 +201,7 @@ cnat_remove_translation_from_db (index_t cci, cnat_endpoint_t * vip, clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 0); } -typedef struct -{ - cnat_ep_trk_t *trk; - u32 index; - u32 offset; - u32 skip; -} cnat_maglev_entry_t; - -static int -cnat_maglev_entry_compare (void *_a, void *_b) -{ - cnat_ep_trk_t *a = ((cnat_maglev_entry_t *) _a)->trk; - cnat_ep_trk_t *b = ((cnat_maglev_entry_t *) _b)->trk; - int rv = 0; - if ((rv = - ip_address_cmp (&a->ct_ep[VLIB_TX].ce_ip, &b->ct_ep[VLIB_TX].ce_ip))) - return rv; - if ((rv = a->ct_ep[VLIB_TX].ce_port - a->ct_ep[VLIB_TX].ce_port)) - return rv; - if ((rv = - ip_address_cmp (&a->ct_ep[VLIB_RX].ce_ip, &b->ct_ep[VLIB_RX].ce_ip))) - return rv; - if ((rv = a->ct_ep[VLIB_RX].ce_port - a->ct_ep[VLIB_RX].ce_port)) - return rv; - return 0; -} - -static void -cnat_translation_init_maglev (cnat_translation_t *ct) -{ - cnat_maglev_entry_t *backends = NULL, *bk; - cnat_main_t *cm = &cnat_main; - u32 done = 0; - cnat_ep_trk_t *trk; - int ep_idx = 0; - - vec_foreach (trk, ct->ct_active_paths) - { - cnat_maglev_entry_t bk; - u32 h1, h2; - if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip)) - { - u32 a, b, c; - a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32; - b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 | - (u64) trk->ct_ep[VLIB_RX].ce_port; - c = ip_addr_v4 (&trk->ct_ep[VLIB_RX].ce_ip).data_u32; - hash_v3_mix32 (a, b, c); - hash_v3_finalize32 (a, b, c); - h1 = c; - h2 = b; - } - else - { - u64 a, b, c; - a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0] ^ - ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1]; - b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 | - (u64) trk->ct_ep[VLIB_RX].ce_port; - c = ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[0] ^ - ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[1]; - hash_mix64 (a, b, c); - h1 = c; - h2 = b; - } - - bk.offset = h1 % cm->maglev_len; - bk.skip = h2 % (cm->maglev_len - 1) + 1; - bk.index = ep_idx++; - bk.trk = trk; - vec_add1 (backends, bk); - } - - if (0 == ep_idx) - return; - - vec_sort_with_function (backends, cnat_maglev_entry_compare); - - /* Don't free if previous vector exists, just zero */ - vec_validate (ct->lb_maglev, cm->maglev_len); - vec_set (ct->lb_maglev, -1); - - while (1) - { - vec_foreach (bk, backends) - { - u32 next = 0; - u32 c = (bk->offset + next * bk->skip) % cm->maglev_len; - while (ct->lb_maglev[c] != (u32) -1) - { - next++; - c = (bk->offset + next * bk->skip) % cm->maglev_len; - } - ct->lb_maglev[c] = bk->index; - done++; - if (done == cm->maglev_len) - goto finished; - } - } - -finished: - vec_free (backends); -} static void cnat_translation_stack (cnat_translation_t * ct) diff --git a/src/plugins/cnat/cnat_types.h b/src/plugins/cnat/cnat_types.h index c3ec74c345f..7b779a66a4e 100644 --- a/src/plugins/cnat/cnat_types.h +++ b/src/plugins/cnat/cnat_types.h @@ -62,6 +62,8 @@ typedef enum cnat_trk_flag_t_ /* Don't translate this endpoint, but still * forward. Used by maglev for DSR */ CNAT_TRK_FLAG_NO_NAT = (1 << 1), + /* */ + CNAT_TRK_FLAG_TEST_DISABLED = (1 << 7), } cnat_trk_flag_t; typedef enum diff --git a/src/vppinfra/format.h b/src/vppinfra/format.h index d143ecdb6e9..ee47a2099c2 100644 --- a/src/vppinfra/format.h +++ b/src/vppinfra/format.h @@ -98,6 +98,7 @@ _(format_hex_bytes_no_wrap); _(format_white_space); _(format_f64); _(format_time_interval); +_ (format_duration); #ifdef CLIB_UNIX /* Unix specific formats. */ diff --git a/src/vppinfra/std-formats.c b/src/vppinfra/std-formats.c index 421cb3dda99..99ea0c1a713 100644 --- a/src/vppinfra/std-formats.c +++ b/src/vppinfra/std-formats.c @@ -134,6 +134,52 @@ format_white_space (u8 * s, va_list * va) return s; } +u8 * +format_duration (u8 *s, va_list *args) +{ + f64 t = va_arg (*args, f64); + s = format (s, ""); + + const f64 seconds_per_minute = 60; + const f64 seconds_per_hour = 60 * seconds_per_minute; + const f64 seconds_per_day = 24 * seconds_per_hour; + uword days, hours, minutes, secs, msecs, usecs; + + days = t / seconds_per_day; + t -= days * seconds_per_day; + + hours = t / seconds_per_hour; + t -= hours * seconds_per_hour; + + minutes = t / seconds_per_minute; + t -= minutes * seconds_per_minute; + + secs = t; + t -= secs; + + msecs = 1e3 * t; + + usecs = 1e6 * t; + usecs = usecs % 1000; + + if (t == 0.) + s = format (s, "0"); + if (days) + s = format (s, "%ddays ", days); + if (hours) + s = format (s, "%dh ", hours); + if (minutes) + s = format (s, "%dmin ", minutes); + if (secs) + s = format (s, "%ds ", secs); + if (msecs) + s = format (s, "%dms ", msecs); + if (usecs) + s = format (s, "%dus", usecs); + + return (s); +} + u8 * format_time_interval (u8 * s, va_list * args) { -- cgit 1.2.3-korg