diff options
Diffstat (limited to 'src/plugins/cnat')
25 files changed, 1031 insertions, 523 deletions
diff --git a/src/plugins/cnat/CMakeLists.txt b/src/plugins/cnat/CMakeLists.txt index cfb55661a78..e99bf056a35 100644 --- a/src/plugins/cnat/CMakeLists.txt +++ b/src/plugins/cnat/CMakeLists.txt @@ -24,6 +24,7 @@ add_vpp_plugin(cnat cnat_types.c cnat_snat_policy.c cnat_src_policy.c + cnat_maglev.c API_FILES cnat.api diff --git a/src/plugins/cnat/FEATURE.yaml b/src/plugins/cnat/FEATURE.yaml index 9deda2e94cc..880d713b63f 100644 --- a/src/plugins/cnat/FEATURE.yaml +++ b/src/plugins/cnat/FEATURE.yaml @@ -9,7 +9,7 @@ description: "This plugin is intended to complement the VPP's plugin_nat for Cloud use-cases. It allows for source/destination address/port translation based on multiple criterias. It is intended to be modular enough so that one could write a use-case optimised translation function - without having to deal with actually re-writing packets or maintining + without having to deal with actually re-writing packets or maintaining sessions. This plugin supports multithreading. Workers share a unique bihash where sessions are stored." diff --git a/src/plugins/cnat/cnat.api b/src/plugins/cnat/cnat.api index e253084e74e..e6ad37dd6eb 100644 --- a/src/plugins/cnat/cnat.api +++ b/src/plugins/cnat/cnat.api @@ -1,6 +1,6 @@ /* Hey Emacs use -*- mode: C -*- */ /* - * Copyright (c) 2016 Cisco and/or its affiliates. + * Copyright (c) 2023 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: @@ -19,14 +19,16 @@ used to control the ABF plugin */ -option version = "0.2.0"; +option version = "0.3.0"; import "vnet/ip/ip_types.api"; import "vnet/fib/fib_types.api"; import "vnet/interface_types.api"; +import "vnet/ip/ip.api"; enum cnat_translation_flags:u8 { CNAT_TRANSLATION_ALLOC_PORT = 1, + CNAT_TRANSLATION_NO_RETURN_SESSION = 4, }; enum cnat_endpoint_tuple_flags:u8 @@ -70,6 +72,7 @@ typedef cnat_translation u8 flags; vl_api_cnat_lb_type_t lb_type; u32 n_paths; + vl_api_ip_flow_hash_config_v2_t flow_hash_config; vl_api_cnat_endpoint_tuple_t paths[n_paths]; }; @@ -172,6 +175,7 @@ enum cnat_snat_policy_table:u8 CNAT_POLICY_INCLUDE_V4 = 0, CNAT_POLICY_INCLUDE_V6 = 1, CNAT_POLICY_POD = 2, + CNAT_POLICY_HOST = 3, }; autoreply define cnat_snat_policy_add_del_if diff --git a/src/plugins/cnat/cnat.rst b/src/plugins/cnat/cnat.rst index 8781f405a23..b0426f35373 100644 --- a/src/plugins/cnat/cnat.rst +++ b/src/plugins/cnat/cnat.rst @@ -9,7 +9,7 @@ Overview ________ This plugin covers specific NAT use-cases that come mostly -from the container networking world. On the contraty of the +from the container networking world. On the contrary of the NAT concepts used for e.g. a home gateway, there is no notion of 'outside' and 'inside'. We handle Virtual (or Real) IPs and translations of the packets destined to them @@ -33,9 +33,9 @@ that will store the packet rewrite to do and the one to undo until the flow is reset or a timeout is reached A ``session`` is a fully resolved 9-tuple of ``src_ip, src_port, dest_ip, dest_port, proto`` -to match incoming packets, and their new attributes ``new_src_ip, new_src_port, new_dest_ip, new_dest_port``. It allows for ``backend`` stickyness and a fast-path for established connections. +to match incoming packets, and their new attributes ``new_src_ip, new_src_port, new_dest_ip, new_dest_port``. It allows for ``backend`` stickiness and a fast-path for established connections. -These ``sessions`` expire after 30s for regular ``sessions`` and 1h for estabished +These ``sessions`` expire after 30s for regular ``sessions`` and 1h for established TCP connections. These can be changed in vpp's configuration file .. code-block:: console @@ -64,7 +64,7 @@ assigned to an interface If ``30.0.0.2`` is the address of an interface, we can use the following -to do the same translation, and additionnaly change the source. +to do the same translation, and additionally change the source. address with ``1.2.3.4`` .. code-block:: console @@ -75,17 +75,17 @@ To show existing translations and sessions you can use .. code-block:: console - cnat show session verbose - cant show translation + show cnat session verbose + show cnat translation SourceNATing outgoing traffic ----------------------------- -A independant part of the plugin allows changing the source address +A independent part of the plugin allows changing the source address of outgoing traffic on a per-interface basis. -In the following example, all traffic comming from ``tap0`` and NOT +In the following example, all traffic coming from ``tap0`` and NOT going to ``20.0.0.0/24`` will be source NAT-ed with ``30.0.0.1``. On the way back the translation will be undone. @@ -94,10 +94,18 @@ address assigned to an interface) .. code-block:: console - cnat snat with 30.0.0.1 - cnat snat exclude 20.0.0.0/24 + set cnat snat-policy addr 30.0.0.1 + set cnat snat-policy if-pfx + set cnat snat-policy if table include-v4 tap0 + set cnat snat-policy prefix 20.0.0.0/24 set interface feature tap0 cnat-snat-ip4 arc ip4-unicast +To show the enforced snat policies: + +.. code-block:: console + + show cnat snat-policy + Other parameters ---------------- @@ -105,7 +113,7 @@ In vpp's startup file, you can also configure the bihash sizes for * the translation bihash ``(proto, port) -> translation`` * the session bihash ``src_ip, src_port, dest_ip, dest_port, proto -> new_src_ip, new_src_port, new_dest_ip, new_dest_port`` -* the snat bihash for searching ``snat exclude`` prefixes +* the snat bihash for searching ``snat-policy`` excluded prefixes .. code-block:: console @@ -126,19 +134,19 @@ This plugin is built to be extensible. For now two NAT types are defined, ``cnat * Session lookup : ``rv`` will be set to ``0`` if a session was found * Translation primitives ``cnat_translation_ip4`` based on sessions * A session creation primitive ``cnat_session_create`` +* A reverse session creation primitive ``cnat_rsession_create`` -Creating a session will also create a reverse session (for matching return traffic), -and call a NAT node back that will perform the translation. +Creating a session will also create reverse session matching return traffic unless told otherwise by setting ``CNAT_TR_FLAG_NO_RETURN_SESSION`` on the translation. This will call the NAT nodes on the return flow and perform the inverse translation. Known limitations _________________ -This plugin is still under developpment, it lacks the following features : +This plugin is still under development, it lacks the following features : * Load balancing doesn't support parametric probabilities -* VRFs aren't supported. All rules apply to fib table 0 only +* VRFs are not supported, all rules apply regardless of the FIB table. * Programmatic session handling (deletion, lifetime updates) aren't supported -* ICMP is not yet supported -* Traffic matching is only done based on ``(proto, dst_addr, dst_port)`` source matching isn't supported +* translations (i.e. rewriting the destination address) only match on the three +tuple ``(proto, dst_addr, dst_port)`` other matches are not supported * Statistics & session tracking are still rudimentary. diff --git a/src/plugins/cnat/cnat_api.c b/src/plugins/cnat/cnat_api.c index ea4b3aeaaef..c578e303499 100644 --- a/src/plugins/cnat/cnat_api.c +++ b/src/plugins/cnat/cnat_api.c @@ -81,7 +81,7 @@ cnat_endpoint_encode (const cnat_endpoint_t * in, if (in->ce_flags & CNAT_EP_FLAG_RESOLVED) ip_address_encode2 (&in->ce_ip, &out->addr); else - clib_memset ((void *) &in->ce_ip, 0, sizeof (in->ce_ip)); + clib_memset (&out->addr, 0, sizeof (out->addr)); } static void @@ -97,6 +97,7 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t int rv = 0; u32 pi, n_paths; cnat_lb_type_t lb_type; + flow_hash_config_t flow_hash_config = 0; rv = ip_proto_decode (mp->translation.ip_proto, &ip_proto); @@ -123,7 +124,10 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t flags |= CNAT_FLAG_EXCLUSIVE; lb_type = (cnat_lb_type_t) mp->translation.lb_type; - id = cnat_translation_update (&vip, ip_proto, paths, flags, lb_type); + flow_hash_config = (flow_hash_config_t) clib_net_to_host_u32 ( + mp->translation.flow_hash_config); + id = cnat_translation_update (&vip, ip_proto, paths, flags, lb_type, + flow_hash_config); vec_free (paths); diff --git a/src/plugins/cnat/cnat_bihash.h b/src/plugins/cnat/cnat_bihash.h index c488e61a07d..75099f6bfdb 100644 --- a/src/plugins/cnat/cnat_bihash.h +++ b/src/plugins/cnat/cnat_bihash.h @@ -44,11 +44,16 @@ typedef struct u64 value[7]; } clib_bihash_kv_40_56_t; +static inline void +clib_bihash_mark_free_40_56 (clib_bihash_kv_40_56_t *v) +{ + v->value[0] = 0xFEEDFACE8BADF00DULL; +} + static inline int clib_bihash_is_free_40_56 (const clib_bihash_kv_40_56_t *v) { - /* Free values are clib_memset to 0xff, check a bit... */ - if (v->key[0] == ~0ULL && v->value[0] == ~0ULL) + if (v->value[0] == 0xFEEDFACE8BADF00DULL) return 1; return 0; } diff --git a/src/plugins/cnat/cnat_client.c b/src/plugins/cnat/cnat_client.c index b8fcb9add64..a28896a4c12 100644 --- a/src/plugins/cnat/cnat_client.c +++ b/src/plugins/cnat/cnat_client.c @@ -20,10 +20,9 @@ #include <cnat/cnat_translation.h> cnat_client_t *cnat_client_pool; - cnat_client_db_t cnat_client_db; - dpo_type_t cnat_client_dpo; +fib_source_t cnat_fib_source; static_always_inline u8 cnat_client_is_clone (cnat_client_t * cc) @@ -34,10 +33,42 @@ cnat_client_is_clone (cnat_client_t * cc) static void cnat_client_db_remove (cnat_client_t * cc) { + clib_bihash_kv_16_8_t bkey; + if (ip_addr_version (&cc->cc_ip) == AF_IP4) + { + bkey.key[0] = ip_addr_v4 (&cc->cc_ip).as_u32; + bkey.key[1] = 0; + } + else + { + bkey.key[0] = ip_addr_v6 (&cc->cc_ip).as_u64[0]; + bkey.key[1] = ip_addr_v6 (&cc->cc_ip).as_u64[1]; + } + + clib_bihash_add_del_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, 0 /* del */); +} + +static void +cnat_client_db_add (cnat_client_t *cc) +{ + index_t cci; + + cci = cc - cnat_client_pool; + + clib_bihash_kv_16_8_t bkey; + bkey.value = cci; if (ip_addr_version (&cc->cc_ip) == AF_IP4) - hash_unset (cnat_client_db.crd_cip4, ip_addr_v4 (&cc->cc_ip).as_u32); + { + bkey.key[0] = ip_addr_v4 (&cc->cc_ip).as_u32; + bkey.key[1] = 0; + } else - hash_unset_mem_free (&cnat_client_db.crd_cip6, &ip_addr_v6 (&cc->cc_ip)); + { + bkey.key[0] = ip_addr_v6 (&cc->cc_ip).as_u64[0]; + bkey.key[1] = ip_addr_v6 (&cc->cc_ip).as_u64[1]; + } + + clib_bihash_add_del_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, 1 /* add */); } static void @@ -118,21 +149,6 @@ cnat_client_translation_deleted (index_t cci) cnat_client_destroy (cc); } -static void -cnat_client_db_add (cnat_client_t * cc) -{ - index_t cci; - - cci = cc - cnat_client_pool; - - if (ip_addr_version (&cc->cc_ip) == AF_IP4) - hash_set (cnat_client_db.crd_cip4, ip_addr_v4 (&cc->cc_ip).as_u32, cci); - else - hash_set_mem_alloc (&cnat_client_db.crd_cip6, - &ip_addr_v6 (&cc->cc_ip), cci); -} - - index_t cnat_client_add (const ip_address_t * ip, u8 flags) { @@ -228,12 +244,6 @@ int cnat_client_purge (void) { int rv = 0, rrv = 0; - if ((rv = hash_elts (cnat_client_db.crd_cip6))) - clib_warning ("len(crd_cip6) isnt 0 but %d", rv); - rrv |= rv; - if ((rv = hash_elts (cnat_client_db.crd_cip4))) - clib_warning ("len(crd_cip4) isnt 0 but %d", rv); - rrv |= rv; if ((rv = pool_elts (cnat_client_pool))) clib_warning ("len(cnat_client_pool) isnt 0 but %d", rv); rrv |= rv; @@ -251,9 +261,9 @@ format_cnat_client (u8 * s, va_list * args) cnat_client_t *cc = pool_elt_at_index (cnat_client_pool, cci); - s = format (s, "[%d] cnat-client:[%U] tr:%d sess:%d", cci, - format_ip_address, &cc->cc_ip, - cc->tr_refcnt, cc->session_refcnt); + s = format (s, "[%d] cnat-client:[%U] tr:%d sess:%d locks:%u", cci, + format_ip_address, &cc->cc_ip, cc->tr_refcnt, cc->session_refcnt, + cc->cc_locks); if (cc->flags & CNAT_FLAG_EXCLUSIVE) s = format (s, " exclusive"); @@ -291,7 +301,6 @@ cnat_client_show (vlib_main_t * vm, vlib_cli_output(vm, "%U", format_cnat_client, cci, 0); vlib_cli_output (vm, "%d clients", pool_elts (cnat_client_pool)); - vlib_cli_output (vm, "%d timestamps", pool_elts (cnat_timestamps)); } else { @@ -371,12 +380,15 @@ const static dpo_vft_t cnat_client_dpo_vft = { static clib_error_t * cnat_client_init (vlib_main_t * vm) { + cnat_main_t *cm = &cnat_main; cnat_client_dpo = dpo_register_new_type (&cnat_client_dpo_vft, cnat_client_dpo_nodes); - cnat_client_db.crd_cip6 = hash_create_mem (0, - sizeof (ip6_address_t), - sizeof (uword)); + clib_bihash_init_16_8 (&cnat_client_db.cc_ip_id_hash, "CNat client DB", + cm->client_hash_buckets, cm->client_hash_memory); + + cnat_fib_source = fib_source_allocate ("cnat", CNAT_FIB_SOURCE_PRIORITY, + FIB_SOURCE_BH_SIMPLE); clib_spinlock_init (&cnat_client_db.throttle_lock); cnat_client_db.throttle_mem = diff --git a/src/plugins/cnat/cnat_client.h b/src/plugins/cnat/cnat_client.h index d6e3631d868..4dc6b754b2f 100644 --- a/src/plugins/cnat/cnat_client.h +++ b/src/plugins/cnat/cnat_client.h @@ -17,6 +17,7 @@ #define __CNAT_CLIENT_H__ #include <cnat/cnat_types.h> +#include <vppinfra/bihash_16_8.h> /** * A client is a representation of an IP address behind the NAT. @@ -85,8 +86,6 @@ extern void cnat_client_free_by_ip (ip46_address_t * addr, u8 af); extern cnat_client_t *cnat_client_pool; extern dpo_type_t cnat_client_dpo; -#define CC_INDEX_INVALID ((u32)(~0)) - static_always_inline cnat_client_t * cnat_client_get (index_t i) { @@ -132,8 +131,7 @@ extern void cnat_client_throttle_pool_process (); */ typedef struct cnat_client_db_t_ { - uword *crd_cip4; - uword *crd_cip6; + clib_bihash_16_8_t cc_ip_id_hash; /* Pool of addresses that have been throttled and need to be refcounted before calling cnat_client_free_by_ip */ @@ -149,27 +147,15 @@ extern cnat_client_db_t cnat_client_db; static_always_inline cnat_client_t * cnat_client_ip4_find (const ip4_address_t * ip) { - uword *p; - - p = hash_get (cnat_client_db.crd_cip4, ip->as_u32); - - if (p) - return (pool_elt_at_index (cnat_client_pool, p[0])); - - return (NULL); -} - -static_always_inline u32 -cnat_client_ip4_find_index (const ip4_address_t * ip) -{ - uword *p; + clib_bihash_kv_16_8_t bkey, bval; - p = hash_get (cnat_client_db.crd_cip4, ip->as_u32); + bkey.key[0] = ip->as_u32; + bkey.key[1] = 0; - if (p) - return p[0]; + if (clib_bihash_search_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, &bval)) + return (NULL); - return -1; + return (pool_elt_at_index (cnat_client_pool, bval.value)); } /** @@ -178,14 +164,15 @@ cnat_client_ip4_find_index (const ip4_address_t * ip) static_always_inline cnat_client_t * cnat_client_ip6_find (const ip6_address_t * ip) { - uword *p; + clib_bihash_kv_16_8_t bkey, bval; - p = hash_get_mem (cnat_client_db.crd_cip6, ip); + bkey.key[0] = ip->as_u64[0]; + bkey.key[1] = ip->as_u64[1]; - if (p) - return (pool_elt_at_index (cnat_client_pool, p[0])); + if (clib_bihash_search_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, &bval)) + return (NULL); - return (NULL); + return (pool_elt_at_index (cnat_client_pool, bval.value)); } /** diff --git a/src/plugins/cnat/cnat_inline.h b/src/plugins/cnat/cnat_inline.h index 5a55ecbf3c0..2986b3497a9 100644 --- a/src/plugins/cnat/cnat_inline.h +++ b/src/plugins/cnat/cnat_inline.h @@ -19,72 +19,122 @@ #include <cnat/cnat_types.h> +always_inline int +cnat_ts_is_free_index (u32 index) +{ + u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS); + index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS); + return pool_is_free_index (cnat_timestamps.ts_pools[pidx], index); +} + +always_inline cnat_timestamp_t * +cnat_timestamp_get (u32 index) +{ + /* 6 top bits for choosing pool */ + u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS); + index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS); + return pool_elt_at_index (cnat_timestamps.ts_pools[pidx], index); +} + +always_inline cnat_timestamp_t * +cnat_timestamp_get_if_valid (u32 index) +{ + /* 6 top bits for choosing pool */ + u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS); + index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS); + if (pidx >= cnat_timestamps.next_empty_pool_idx) + return (NULL); + if (pool_is_free_index (cnat_timestamps.ts_pools[pidx], index)) + return (NULL); + return pool_elt_at_index (cnat_timestamps.ts_pools[pidx], index); +} + +always_inline index_t +cnat_timestamp_alloc () +{ + cnat_timestamp_t *ts; + u32 index, pool_sz; + uword pidx; + + clib_spinlock_lock (&cnat_timestamps.ts_lock); + pidx = clib_bitmap_first_set (cnat_timestamps.ts_free); + pool_sz = 1 << (CNAT_TS_BASE_SIZE + pidx); + ASSERT (pidx <= cnat_timestamps.next_empty_pool_idx); + if (pidx == cnat_timestamps.next_empty_pool_idx) + pool_init_fixed ( + cnat_timestamps.ts_pools[cnat_timestamps.next_empty_pool_idx++], + pool_sz); + pool_get (cnat_timestamps.ts_pools[pidx], ts); + if (pool_elts (cnat_timestamps.ts_pools[pidx]) == pool_sz) + clib_bitmap_set (cnat_timestamps.ts_free, pidx, 0); + clib_spinlock_unlock (&cnat_timestamps.ts_lock); + + index = (u32) pidx << (32 - CNAT_TS_MPOOL_BITS); + return index | (ts - cnat_timestamps.ts_pools[pidx]); +} + +always_inline void +cnat_timestamp_destroy (u32 index) +{ + u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS); + index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS); + clib_spinlock_lock (&cnat_timestamps.ts_lock); + pool_put_index (cnat_timestamps.ts_pools[pidx], index); + clib_bitmap_set (cnat_timestamps.ts_free, pidx, 1); + clib_spinlock_unlock (&cnat_timestamps.ts_lock); +} + always_inline u32 cnat_timestamp_new (f64 t) { - u32 index; - cnat_timestamp_t *ts; - clib_rwlock_writer_lock (&cnat_main.ts_lock); - pool_get (cnat_timestamps, ts); + index_t index = cnat_timestamp_alloc (); + cnat_timestamp_t *ts = cnat_timestamp_get (index); ts->last_seen = t; ts->lifetime = cnat_main.session_max_age; ts->refcnt = CNAT_TIMESTAMP_INIT_REFCNT; - index = ts - cnat_timestamps; - clib_rwlock_writer_unlock (&cnat_main.ts_lock); return index; } always_inline void cnat_timestamp_inc_refcnt (u32 index) { - clib_rwlock_reader_lock (&cnat_main.ts_lock); - cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index); - ts->refcnt++; - clib_rwlock_reader_unlock (&cnat_main.ts_lock); + cnat_timestamp_t *ts = cnat_timestamp_get (index); + clib_atomic_add_fetch (&ts->refcnt, 1); } always_inline void cnat_timestamp_update (u32 index, f64 t) { - clib_rwlock_reader_lock (&cnat_main.ts_lock); - cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index); + cnat_timestamp_t *ts = cnat_timestamp_get (index); ts->last_seen = t; - clib_rwlock_reader_unlock (&cnat_main.ts_lock); } always_inline void cnat_timestamp_set_lifetime (u32 index, u16 lifetime) { - clib_rwlock_reader_lock (&cnat_main.ts_lock); - cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index); + cnat_timestamp_t *ts = cnat_timestamp_get (index); ts->lifetime = lifetime; - clib_rwlock_reader_unlock (&cnat_main.ts_lock); } always_inline f64 cnat_timestamp_exp (u32 index) { f64 t; - if (INDEX_INVALID == index) + cnat_timestamp_t *ts = cnat_timestamp_get_if_valid (index); + if (NULL == ts) return -1; - clib_rwlock_reader_lock (&cnat_main.ts_lock); - cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index); t = ts->last_seen + (f64) ts->lifetime; - clib_rwlock_reader_unlock (&cnat_main.ts_lock); return t; } always_inline void cnat_timestamp_free (u32 index) { - if (INDEX_INVALID == index) + cnat_timestamp_t *ts = cnat_timestamp_get_if_valid (index); + if (NULL == ts) return; - clib_rwlock_writer_lock (&cnat_main.ts_lock); - cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index); - ts->refcnt--; - if (0 == ts->refcnt) - pool_put (cnat_timestamps, ts); - clib_rwlock_writer_unlock (&cnat_main.ts_lock); + if (0 == clib_atomic_sub_fetch (&ts->refcnt, 1)) + cnat_timestamp_destroy (index); } /* diff --git a/src/plugins/cnat/cnat_maglev.c b/src/plugins/cnat/cnat_maglev.c new file mode 100644 index 00000000000..2cdb868b3d7 --- /dev/null +++ b/src/plugins/cnat/cnat_maglev.c @@ -0,0 +1,379 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc. + */ + +#include <cnat/cnat_maglev.h> + +static int +cnat_maglev_perm_compare (void *_a, void *_b) +{ + return *(u64 *) _b - *(u64 *) _a; +} + +/** + * Maglev algorithm implementation. This takes permutation as input, + * with the values of offset & skip for the backends. + * It fills buckets matching the permuntations, provided buckets is + * already of length at least M + */ +static void +cnat_maglev_shuffle (cnat_maglev_perm_t *permutation, u32 *buckets) +{ + u32 N, M, i, done = 0; + u32 *next = 0; + + N = vec_len (permutation); + if (N == 0) + return; + + M = vec_len (buckets); + if (M == 0) + return; + vec_set (buckets, -1); + + vec_validate (next, N - 1); + vec_zero (next); + + while (1) + { + for (i = 0; i < N; i++) + { + u32 c = (permutation[i].offset + next[i] * permutation[i].skip) % M; + while (buckets[c] != (u32) -1) + { + next[i]++; + c = (permutation[i].offset + next[i] * permutation[i].skip) % M; + } + + buckets[c] = permutation[i].index; + next[i]++; + done++; + + if (done == M) + { + vec_free (next); + return; + } + } + } +} + +void +cnat_translation_init_maglev (cnat_translation_t *ct) +{ + cnat_maglev_perm_t *permutations = NULL; + cnat_main_t *cm = &cnat_main; + cnat_ep_trk_t *trk; + u32 backend_index = 0; + + if (vec_len (ct->ct_active_paths) == 0) + return; + + vec_foreach (trk, ct->ct_active_paths) + { + cnat_maglev_perm_t permutation; + u32 h1, h2; + + if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip)) + { + u32 a, b, c; + a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32; + b = (u64) trk->ct_ep[VLIB_TX].ce_port; + c = 0; + hash_v3_mix32 (a, b, c); + hash_v3_finalize32 (a, b, c); + h1 = c; + h2 = b; + } + else + { + u64 a, b, c; + a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0]; + b = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1]; + c = (u64) trk->ct_ep[VLIB_TX].ce_port; + hash_mix64 (a, b, c); + h1 = c; + h2 = b; + } + + permutation.offset = h1 % cm->maglev_len; + permutation.skip = h2 % (cm->maglev_len - 1) + 1; + permutation.index = backend_index++; + + if (trk->ct_flags & CNAT_TRK_FLAG_TEST_DISABLED) + continue; + + vec_add1 (permutations, permutation); + } + + vec_sort_with_function (permutations, cnat_maglev_perm_compare); + + vec_validate (ct->lb_maglev, cm->maglev_len - 1); + + cnat_maglev_shuffle (permutations, ct->lb_maglev); + + vec_free (permutations); +} + +static int +cnat_u32_vec_contains (u32 *v, u32 e) +{ + int i; + + vec_foreach_index (i, v) + if (v[i] == e) + return 1; + + return 0; +} + +static void +cnat_maglev_print_changes (vlib_main_t *vm, u32 *changed_bk_indices, + u32 *old_maglev_lb, u32 *new_maglev_lb) +{ + u32 good_flow_buckets = 0, reset_flow_buckets = 0, stable_to_reset = 0; + u32 reset_to_stable = 0, switched_stable = 0; + if (vec_len (new_maglev_lb) == 0) + return; + for (u32 i = 0; i < vec_len (new_maglev_lb); i++) + { + u8 is_new_changed = + cnat_u32_vec_contains (changed_bk_indices, new_maglev_lb[i]); + u8 is_old_changed = + cnat_u32_vec_contains (changed_bk_indices, old_maglev_lb[i]); + if (new_maglev_lb[i] == old_maglev_lb[i]) + { + if (is_new_changed) + reset_flow_buckets++; + else + good_flow_buckets++; + } + else + { + if (is_new_changed) + stable_to_reset++; + else if (is_old_changed) + reset_to_stable++; + else + switched_stable++; + } + } + vlib_cli_output (vm, + "good B->B:%d | lost A->A':%d A->B:%d ~%0.2f%% | bad " + "B->A':%d B->C:%d ~%0.2f%%", + good_flow_buckets, reset_flow_buckets, reset_to_stable, + (f64) (reset_flow_buckets + reset_to_stable) / + vec_len (new_maglev_lb) * 100.0, + stable_to_reset, switched_stable, + (f64) (stable_to_reset + switched_stable) / + vec_len (new_maglev_lb) * 100.0); +} + +static u8 * +format_cnat_maglev_buckets (u8 *s, va_list *args) +{ + u32 *buckets = va_arg (*args, u32 *); + u32 backend_idx = va_arg (*args, u32); + u32 count = va_arg (*args, u32); + + for (u32 ii = 0; ii < vec_len (buckets); ii++) + if (buckets[ii] == backend_idx) + { + s = format (s, "%d,", ii); + if (--count == 0) + return (s); + } + return (s); +} + +static clib_error_t * +cnat_translation_test_init_maglev (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + cnat_translation_t *trs = 0, *ct; + u64 num_backends = 0, n_tests = 0; + cnat_main_t *cm = &cnat_main; + cnat_ep_trk_t *trk; + u32 rnd; + u32 n_changes = 0, n_remove = 0, verbose = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "tests %d", &n_tests)) + ; + else if (unformat (input, "backends %d", &num_backends)) + ; + else if (unformat (input, "len %d", &cm->maglev_len)) + ; + else if (unformat (input, "change %d", &n_changes)) + ; + else if (unformat (input, "rm %d", &n_remove)) + ; + else if (unformat (input, "verbose %d", &verbose)) + ; + else + return (clib_error_return (0, "unknown input '%U'", + format_unformat_error, input)); + } + + if (num_backends == 0 || n_tests == 0) + return (clib_error_return (0, "No backends / tests to run")); + ; + + vlib_cli_output (vm, "generating random backends..."); + rnd = random_default_seed (); + + vec_validate (trs, n_tests - 1); + vec_foreach (ct, trs) + { + vec_validate (ct->ct_active_paths, num_backends - 1); + vec_foreach (trk, ct->ct_active_paths) + { + trk->ct_flags = 0; + ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip) = AF_IP4; + ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32 = random_u32 (&rnd); + trk->ct_ep[VLIB_TX].ce_port = random_u32 (&rnd); + } + } + + vlib_cli_output (vm, "testing..."); + f64 start_time = vlib_time_now (vm); + vec_foreach (ct, trs) + cnat_translation_init_maglev (ct); + f64 d = vlib_time_now (vm) - start_time; + + vlib_cli_output (vm, "Test took : %U", format_duration, d); + vlib_cli_output (vm, "Per pool : %U", format_duration, d / n_tests); + + /* sanity checking of the output */ + u32 *backend_freqs = 0; + vec_validate (backend_freqs, num_backends - 1); + vec_foreach (ct, trs) + { + if (vec_len (ct->lb_maglev) != cm->maglev_len) + vlib_cli_output (vm, "Unexpected bucket length %d", + vec_len (ct->lb_maglev)); + + vec_zero (backend_freqs); + for (u32 i = 0; i < vec_len (ct->lb_maglev); i++) + { + if (ct->lb_maglev[i] >= num_backends) + clib_warning ("out of bound backend"); + backend_freqs[ct->lb_maglev[i]]++; + } + u32 fmin = ~0, fmax = 0; + for (u32 i = 0; i < num_backends; i++) + { + if (backend_freqs[i] > fmax) + fmax = backend_freqs[i]; + if (backend_freqs[i] < fmin) + fmin = backend_freqs[i]; + } + f64 fdiff = (fmax - fmin); + if (fdiff / vec_len (ct->lb_maglev) - 1 > 0.02) + vlib_cli_output (vm, "More than 2%% frequency diff (min %d max %d)", + fmin, fmax); + } + vec_free (backend_freqs); + + int i = 0; + if (verbose) + vec_foreach (ct, trs) + { + vlib_cli_output (vm, "Translation %d", i++); + for (u32 i = 0; i < verbose; i++) + { + u32 j = random_u32 (&rnd) % vec_len (ct->ct_active_paths); + trk = &ct->ct_active_paths[j]; + vlib_cli_output ( + vm, "[%03d] %U:%d buckets:%U", j, format_ip_address, + &trk->ct_ep[VLIB_TX].ce_ip, trk->ct_ep[VLIB_TX].ce_port, + format_cnat_maglev_buckets, ct->lb_maglev, j, verbose); + } + } + + if (n_remove != 0) + { + vlib_cli_output ( + vm, "Removing %d entries (refered to as A), others (B,C) stay same", + n_remove); + vec_foreach (ct, trs) + { + u32 *old_maglev_lb = 0; + u32 *changed_bk_indices = 0; + if (vec_len (ct->lb_maglev) != cm->maglev_len) + vlib_cli_output (vm, "Unexpected bucket length %d", + vec_len (ct->lb_maglev)); + + vec_validate (changed_bk_indices, n_remove - 1); + for (u32 i = 0; i < n_remove; i++) + { + /* remove n_remove backends from the LB set */ + changed_bk_indices[i] = + random_u32 (&rnd) % vec_len (ct->ct_active_paths); + trk = &ct->ct_active_paths[changed_bk_indices[i]]; + trk->ct_flags |= CNAT_TRK_FLAG_TEST_DISABLED; + } + + old_maglev_lb = vec_dup (ct->lb_maglev); + cnat_translation_init_maglev (ct); + + cnat_maglev_print_changes (vm, changed_bk_indices, old_maglev_lb, + ct->lb_maglev); + + vec_free (changed_bk_indices); + vec_free (old_maglev_lb); + } + } + + /* Reshuffle and check changes */ + if (n_changes != 0) + { + vlib_cli_output ( + vm, + "Changing %d entries (refered to as A->A'), others (B,C) stay same", + n_changes); + vec_foreach (ct, trs) + { + if (vec_len (ct->lb_maglev) != cm->maglev_len) + vlib_cli_output (vm, "Unexpected bucket length %d", + vec_len (ct->lb_maglev)); + + u32 *old_maglev_lb = 0; + u32 *changed_bk_indices = 0; + + vec_validate (changed_bk_indices, n_changes - 1); + for (u32 i = 0; i < n_changes; i++) + { + /* Change n_changes backends in the LB set */ + changed_bk_indices[i] = + random_u32 (&rnd) % vec_len (ct->ct_active_paths); + trk = &ct->ct_active_paths[changed_bk_indices[i]]; + ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32 = + random_u32 (&rnd); + trk->ct_ep[VLIB_TX].ce_port = random_u32 (&rnd) & 0xffff; + } + old_maglev_lb = vec_dup (ct->lb_maglev); + + cnat_translation_init_maglev (ct); + cnat_maglev_print_changes (vm, changed_bk_indices, old_maglev_lb, + ct->lb_maglev); + + vec_free (changed_bk_indices); + vec_free (old_maglev_lb); + } + } + + vec_foreach (ct, trs) + vec_free (ct->ct_active_paths); + vec_free (trs); + + return (NULL); +} + +VLIB_CLI_COMMAND (cnat_translation_test_init_maglev_cmd, static) = { + .path = "test cnat maglev", + .short_help = "test cnat maglev tests [n_tests] backends [num_backends] len " + "[maglev_len]", + .function = cnat_translation_test_init_maglev, +}; diff --git a/src/plugins/cnat/cnat_maglev.h b/src/plugins/cnat/cnat_maglev.h new file mode 100644 index 00000000000..a71dd3ce796 --- /dev/null +++ b/src/plugins/cnat/cnat_maglev.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright(c) 2022 Cisco Systems, Inc. + */ + +#ifndef __CNAT_MAGLEV_H__ +#define __CNAT_MAGLEV_H__ + +#include <cnat/cnat_types.h> +#include <cnat/cnat_translation.h> + +typedef struct +{ + /* offset & skip used for sorting, should be first */ + u32 offset; + u32 skip; + u32 index; +} cnat_maglev_perm_t; + +extern void cnat_translation_init_maglev (cnat_translation_t *ct); + +#endif
\ No newline at end of file diff --git a/src/plugins/cnat/cnat_node.h b/src/plugins/cnat/cnat_node.h index 246fdb8ba57..d81f6745bc4 100644 --- a/src/plugins/cnat/cnat_node.h +++ b/src/plugins/cnat/cnat_node.h @@ -19,6 +19,7 @@ #include <vlibmemory/api.h> #include <vnet/dpo/load_balance.h> #include <vnet/dpo/load_balance_map.h> +#include <vnet/ip/ip_psh_cksum.h> #include <cnat/cnat_session.h> #include <cnat/cnat_client.h> @@ -169,86 +170,92 @@ cmp_ip6_address (const ip6_address_t * a1, const ip6_address_t * a2) * Inline translation functions */ -static_always_inline u8 -has_ip6_address (ip6_address_t * a) +static_always_inline u16 +ip4_pseudo_header_cksum2 (ip4_header_t *ip4, ip4_address_t address[VLIB_N_DIR]) { - return ((0 != a->as_u64[0]) || (0 != a->as_u64[1])); + ip4_psh_t psh = { 0 }; + psh.src = address[VLIB_RX]; + psh.dst = address[VLIB_TX]; + psh.proto = ip4->protocol; + psh.l4len = clib_host_to_net_u16 (clib_net_to_host_u16 (ip4->length) - + sizeof (ip4_header_t)); + return ~(clib_ip_csum ((u8 *) &psh, sizeof (ip4_psh_t))); } static_always_inline void -cnat_ip4_translate_l4 (ip4_header_t * ip4, udp_header_t * udp, - ip_csum_t * sum, +cnat_ip4_translate_l4 (ip4_header_t *ip4, udp_header_t *udp, ip_csum_t *sum, ip4_address_t new_addr[VLIB_N_DIR], - u16 new_port[VLIB_N_DIR]) + u16 new_port[VLIB_N_DIR], u32 oflags) { u16 old_port[VLIB_N_DIR]; - ip4_address_t old_addr[VLIB_N_DIR]; + old_port[VLIB_TX] = udp->dst_port; + old_port[VLIB_RX] = udp->src_port; - /* Fastpath no checksum */ - if (PREDICT_TRUE (0 == *sum)) + udp->dst_port = new_port[VLIB_TX]; + udp->src_port = new_port[VLIB_RX]; + + if (oflags & + (VNET_BUFFER_OFFLOAD_F_TCP_CKSUM | VNET_BUFFER_OFFLOAD_F_UDP_CKSUM)) { - udp->dst_port = new_port[VLIB_TX]; - udp->src_port = new_port[VLIB_RX]; + *sum = ip4_pseudo_header_cksum2 (ip4, new_addr); return; } - old_port[VLIB_TX] = udp->dst_port; - old_port[VLIB_RX] = udp->src_port; - old_addr[VLIB_TX] = ip4->dst_address; - old_addr[VLIB_RX] = ip4->src_address; + *sum = ip_csum_update (*sum, ip4->dst_address.as_u32, + new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address); + *sum = ip_csum_update (*sum, ip4->src_address.as_u32, + new_addr[VLIB_RX].as_u32, ip4_header_t, src_address); - if (new_addr[VLIB_TX].as_u32) + *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX], + udp_header_t, dst_port); + *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX], + udp_header_t, src_port); +} + +static_always_inline void +cnat_ip4_translate_sctp (ip4_header_t *ip4, sctp_header_t *sctp, + u16 new_port[VLIB_N_DIR]) +{ + /* Fastpath no checksum */ + if (PREDICT_TRUE (0 == sctp->checksum)) { - *sum = - ip_csum_update (*sum, old_addr[VLIB_TX].as_u32, - new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address); + sctp->dst_port = new_port[VLIB_TX]; + sctp->src_port = new_port[VLIB_RX]; + return; } + if (new_port[VLIB_TX]) - { - udp->dst_port = new_port[VLIB_TX]; - *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX], - ip4_header_t /* cheat */ , - length /* changed member */ ); - } - if (new_addr[VLIB_RX].as_u32) - { - *sum = - ip_csum_update (*sum, old_addr[VLIB_RX].as_u32, - new_addr[VLIB_RX].as_u32, ip4_header_t, src_address); - } + sctp->dst_port = new_port[VLIB_TX]; if (new_port[VLIB_RX]) - { - udp->src_port = new_port[VLIB_RX]; - *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX], - ip4_header_t /* cheat */ , - length /* changed member */ ); - } + sctp->src_port = new_port[VLIB_RX]; + + sctp->checksum = 0; + sctp->checksum = clib_host_to_little_u32 (~clib_crc32c_with_init ( + (u8 *) sctp, ntohs (ip4->length) - sizeof (ip4_header_t), + ~0 /* init value */)); } static_always_inline void -cnat_ip4_translate_l3 (ip4_header_t * ip4, ip4_address_t new_addr[VLIB_N_DIR]) +cnat_ip4_translate_l3 (ip4_header_t *ip4, ip4_address_t new_addr[VLIB_N_DIR], + u32 oflags) { ip4_address_t old_addr[VLIB_N_DIR]; ip_csum_t sum; - old_addr[VLIB_TX] = ip4->dst_address; old_addr[VLIB_RX] = ip4->src_address; + ip4->dst_address = new_addr[VLIB_TX]; + ip4->src_address = new_addr[VLIB_RX]; + + // We always compute the IP checksum even if oflags & + // VNET_BUFFER_OFFLOAD_F_IP_CKSUM is set as this is relatively inexpensive + // and will allow avoiding issues in driver that do not behave properly + // downstream. sum = ip4->checksum; - if (new_addr[VLIB_TX].as_u32) - { - ip4->dst_address = new_addr[VLIB_TX]; - sum = - ip_csum_update (sum, old_addr[VLIB_TX].as_u32, + sum = ip_csum_update (sum, old_addr[VLIB_TX].as_u32, new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address); - } - if (new_addr[VLIB_RX].as_u32) - { - ip4->src_address = new_addr[VLIB_RX]; - sum = - ip_csum_update (sum, old_addr[VLIB_RX].as_u32, + sum = ip_csum_update (sum, old_addr[VLIB_RX].as_u32, new_addr[VLIB_RX].as_u32, ip4_header_t, src_address); - } ip4->checksum = ip_csum_fold (sum); } @@ -257,48 +264,40 @@ cnat_tcp_update_session_lifetime (tcp_header_t * tcp, u32 index) { cnat_main_t *cm = &cnat_main; if (PREDICT_FALSE (tcp_fin (tcp))) - { - cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT); - } + cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT); if (PREDICT_FALSE (tcp_rst (tcp))) - { - cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT); - } + cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT); if (PREDICT_FALSE (tcp_syn (tcp) && tcp_ack (tcp))) - { - cnat_timestamp_set_lifetime (index, cm->tcp_max_age); - } + cnat_timestamp_set_lifetime (index, cm->tcp_max_age); } static_always_inline void -cnat_translation_icmp4_echo (ip4_header_t * ip4, icmp46_header_t * icmp, +cnat_translation_icmp4_echo (ip4_header_t *ip4, icmp46_header_t *icmp, ip4_address_t new_addr[VLIB_N_DIR], - u16 new_port[VLIB_N_DIR]) + u16 new_port[VLIB_N_DIR], u32 oflags) { ip_csum_t sum; u16 old_port; cnat_echo_header_t *echo = (cnat_echo_header_t *) (icmp + 1); - cnat_ip4_translate_l3 (ip4, new_addr); + cnat_ip4_translate_l3 (ip4, new_addr, oflags); old_port = echo->identifier; echo->identifier = new_port[VLIB_RX]; sum = icmp->checksum; - sum = ip_csum_update (sum, old_port, new_port[VLIB_RX], - ip4_header_t /* cheat */ , - length /* changed member */ ); + sum = + ip_csum_update (sum, old_port, new_port[VLIB_RX], udp_header_t, src_port); icmp->checksum = ip_csum_fold (sum); } static_always_inline void -cnat_translation_icmp4_error (ip4_header_t * outer_ip4, - icmp46_header_t * icmp, +cnat_translation_icmp4_error (ip4_header_t *outer_ip4, icmp46_header_t *icmp, ip4_address_t outer_new_addr[VLIB_N_DIR], - u16 outer_new_port[VLIB_N_DIR], - u8 snat_outer_ip) + u16 outer_new_port[VLIB_N_DIR], u8 snat_outer_ip, + u32 oflags) { ip4_address_t new_addr[VLIB_N_DIR]; ip4_address_t old_addr[VLIB_N_DIR]; @@ -327,18 +326,20 @@ cnat_translation_icmp4_error (ip4_header_t * outer_ip4, /* translate outer ip. */ if (!snat_outer_ip) outer_new_addr[VLIB_RX] = outer_ip4->src_address; - cnat_ip4_translate_l3 (outer_ip4, outer_new_addr); + cnat_ip4_translate_l3 (outer_ip4, outer_new_addr, oflags); if (ip4->protocol == IP_PROTOCOL_TCP) { inner_l4_old_sum = inner_l4_sum = tcp->checksum; - cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port); + cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port, + 0 /* flags */); tcp->checksum = ip_csum_fold (inner_l4_sum); } else if (ip4->protocol == IP_PROTOCOL_UDP) { inner_l4_old_sum = inner_l4_sum = udp->checksum; - cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port); + cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port, + 0 /* flags */); udp->checksum = ip_csum_fold (inner_l4_sum); } else @@ -351,37 +352,30 @@ cnat_translation_icmp4_error (ip4_header_t * outer_ip4, /* UDP/TCP Ports changed */ if (old_port[VLIB_TX] && new_port[VLIB_TX]) sum = ip_csum_update (sum, old_port[VLIB_TX], new_port[VLIB_TX], - ip4_header_t /* cheat */ , - length /* changed member */ ); + udp_header_t, dst_port); if (old_port[VLIB_RX] && new_port[VLIB_RX]) sum = ip_csum_update (sum, old_port[VLIB_RX], new_port[VLIB_RX], - ip4_header_t /* cheat */ , - length /* changed member */ ); - + udp_header_t, src_port); - cnat_ip4_translate_l3 (ip4, new_addr); + cnat_ip4_translate_l3 (ip4, new_addr, 0 /* oflags */); ip_csum_t new_ip_sum = ip4->checksum; /* IP checksum changed */ sum = ip_csum_update (sum, old_ip_sum, new_ip_sum, ip4_header_t, checksum); /* IP src/dst addr changed */ - if (new_addr[VLIB_TX].as_u32) - sum = - ip_csum_update (sum, old_addr[VLIB_TX].as_u32, new_addr[VLIB_TX].as_u32, - ip4_header_t, dst_address); + sum = ip_csum_update (sum, old_addr[VLIB_TX].as_u32, + new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address); - if (new_addr[VLIB_RX].as_u32) - sum = - ip_csum_update (sum, old_addr[VLIB_RX].as_u32, new_addr[VLIB_RX].as_u32, - ip4_header_t, src_address); + sum = ip_csum_update (sum, old_addr[VLIB_RX].as_u32, + new_addr[VLIB_RX].as_u32, ip4_header_t, src_address); icmp->checksum = ip_csum_fold (sum); } static_always_inline void -cnat_translation_ip4 (const cnat_session_t * session, - ip4_header_t * ip4, udp_header_t * udp) +cnat_translation_ip4 (const cnat_session_t *session, ip4_header_t *ip4, + udp_header_t *udp, u32 oflags) { tcp_header_t *tcp = (tcp_header_t *) udp; ip4_address_t new_addr[VLIB_N_DIR]; @@ -395,17 +389,23 @@ cnat_translation_ip4 (const cnat_session_t * session, if (ip4->protocol == IP_PROTOCOL_TCP) { ip_csum_t sum = tcp->checksum; - cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port); + cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port, oflags); tcp->checksum = ip_csum_fold (sum); - cnat_ip4_translate_l3 (ip4, new_addr); + cnat_ip4_translate_l3 (ip4, new_addr, oflags); cnat_tcp_update_session_lifetime (tcp, session->value.cs_ts_index); } else if (ip4->protocol == IP_PROTOCOL_UDP) { ip_csum_t sum = udp->checksum; - cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port); + cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port, oflags); udp->checksum = ip_csum_fold (sum); - cnat_ip4_translate_l3 (ip4, new_addr); + cnat_ip4_translate_l3 (ip4, new_addr, oflags); + } + else if (ip4->protocol == IP_PROTOCOL_SCTP) + { + sctp_header_t *sctp = (sctp_header_t *) udp; + cnat_ip4_translate_sctp (ip4, sctp, new_port); + cnat_ip4_translate_l3 (ip4, new_addr, oflags); } else if (ip4->protocol == IP_PROTOCOL_ICMP) { @@ -417,74 +417,65 @@ cnat_translation_ip4 (const cnat_session_t * session, (ip4->src_address.as_u32 == session->key.cs_ip[VLIB_RX].ip4.as_u32); cnat_translation_icmp4_error (ip4, icmp, new_addr, new_port, - snat_outer_ip); + snat_outer_ip, oflags); } else if (icmp_type_is_echo (icmp->type)) - cnat_translation_icmp4_echo (ip4, icmp, new_addr, new_port); + cnat_translation_icmp4_echo (ip4, icmp, new_addr, new_port, oflags); } } static_always_inline void cnat_ip6_translate_l3 (ip6_header_t * ip6, ip6_address_t new_addr[VLIB_N_DIR]) { - if (has_ip6_address (&new_addr[VLIB_TX])) - ip6_address_copy (&ip6->dst_address, &new_addr[VLIB_TX]); - if (has_ip6_address (&new_addr[VLIB_RX])) - ip6_address_copy (&ip6->src_address, &new_addr[VLIB_RX]); + ip6_address_copy (&ip6->dst_address, &new_addr[VLIB_TX]); + ip6_address_copy (&ip6->src_address, &new_addr[VLIB_RX]); +} + +static_always_inline u16 +ip6_pseudo_header_cksum2 (ip6_header_t *ip6, ip6_address_t address[VLIB_N_DIR]) +{ + ip6_psh_t psh = { 0 }; + psh.src = address[VLIB_RX]; + psh.dst = address[VLIB_TX]; + psh.l4len = ip6->payload_length; + psh.proto = clib_host_to_net_u32 ((u32) ip6->protocol); + return ~(clib_ip_csum ((u8 *) &psh, sizeof (ip6_psh_t))); } static_always_inline void -cnat_ip6_translate_l4 (ip6_header_t * ip6, udp_header_t * udp, - ip_csum_t * sum, +cnat_ip6_translate_l4 (ip6_header_t *ip6, udp_header_t *udp, ip_csum_t *sum, ip6_address_t new_addr[VLIB_N_DIR], - u16 new_port[VLIB_N_DIR]) + u16 new_port[VLIB_N_DIR], u32 oflags) { u16 old_port[VLIB_N_DIR]; - ip6_address_t old_addr[VLIB_N_DIR]; + old_port[VLIB_TX] = udp->dst_port; + old_port[VLIB_RX] = udp->src_port; - /* Fastpath no checksum */ - if (PREDICT_TRUE (0 == *sum)) + udp->dst_port = new_port[VLIB_TX]; + udp->src_port = new_port[VLIB_RX]; + + if (oflags & + (VNET_BUFFER_OFFLOAD_F_TCP_CKSUM | VNET_BUFFER_OFFLOAD_F_UDP_CKSUM)) { - udp->dst_port = new_port[VLIB_TX]; - udp->src_port = new_port[VLIB_RX]; + *sum = ip6_pseudo_header_cksum2 (ip6, new_addr); return; } - old_port[VLIB_TX] = udp->dst_port; - old_port[VLIB_RX] = udp->src_port; - ip6_address_copy (&old_addr[VLIB_TX], &ip6->dst_address); - ip6_address_copy (&old_addr[VLIB_RX], &ip6->src_address); + *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[0]); + *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[1]); + *sum = ip_csum_sub_even (*sum, ip6->dst_address.as_u64[0]); + *sum = ip_csum_sub_even (*sum, ip6->dst_address.as_u64[1]); - if (has_ip6_address (&new_addr[VLIB_TX])) - { - *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[0]); - *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[1]); - *sum = ip_csum_sub_even (*sum, old_addr[VLIB_TX].as_u64[0]); - *sum = ip_csum_sub_even (*sum, old_addr[VLIB_TX].as_u64[1]); - } + *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[0]); + *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[1]); + *sum = ip_csum_sub_even (*sum, ip6->src_address.as_u64[0]); + *sum = ip_csum_sub_even (*sum, ip6->src_address.as_u64[1]); - if (new_port[VLIB_TX]) - { - udp->dst_port = new_port[VLIB_TX]; - *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX], - ip4_header_t /* cheat */ , - length /* changed member */ ); - } - if (has_ip6_address (&new_addr[VLIB_RX])) - { - *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[0]); - *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[1]); - *sum = ip_csum_sub_even (*sum, old_addr[VLIB_RX].as_u64[0]); - *sum = ip_csum_sub_even (*sum, old_addr[VLIB_RX].as_u64[1]); - } + *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX], + udp_header_t, dst_port); - if (new_port[VLIB_RX]) - { - udp->src_port = new_port[VLIB_RX]; - *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX], - ip4_header_t /* cheat */ , - length /* changed member */ ); - } + *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX], + udp_header_t, src_port); } static_always_inline void @@ -503,26 +494,20 @@ cnat_translation_icmp6_echo (ip6_header_t * ip6, icmp46_header_t * icmp, sum = icmp->checksum; cnat_ip6_translate_l3 (ip6, new_addr); - if (has_ip6_address (&new_addr[VLIB_TX])) - { - sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]); - sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]); - } - if (has_ip6_address (&new_addr[VLIB_RX])) - { - sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]); - sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]); - } + sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]); + sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]); + + sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]); + sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]); echo->identifier = new_port[VLIB_RX]; - sum = ip_csum_update (sum, old_port, new_port[VLIB_RX], - ip4_header_t /* cheat */ , - length /* changed member */ ); + sum = + ip_csum_update (sum, old_port, new_port[VLIB_RX], udp_header_t, src_port); icmp->checksum = ip_csum_fold (sum); } @@ -566,79 +551,64 @@ cnat_translation_icmp6_error (ip6_header_t * outer_ip6, if (!snat_outer_ip) ip6_address_copy (&outer_new_addr[VLIB_RX], &outer_ip6->src_address); cnat_ip6_translate_l3 (outer_ip6, outer_new_addr); - if (has_ip6_address (&outer_new_addr[VLIB_TX])) - { - sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[0]); - sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[1]); - sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[0]); - sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[1]); - } - if (has_ip6_address (&outer_new_addr[VLIB_RX])) - { - sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[0]); - sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[1]); - sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[0]); - sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[1]); - } + sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[0]); + sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[1]); + sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[0]); + sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[1]); + + sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[0]); + sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[1]); + sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[0]); + sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[1]); /* Translate inner TCP / UDP */ if (ip6->protocol == IP_PROTOCOL_TCP) { inner_l4_old_sum = inner_l4_sum = tcp->checksum; - cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port); + cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port, + 0 /* oflags */); tcp->checksum = ip_csum_fold (inner_l4_sum); } else if (ip6->protocol == IP_PROTOCOL_UDP) { inner_l4_old_sum = inner_l4_sum = udp->checksum; - cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port); + cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port, + 0 /* oflags */); udp->checksum = ip_csum_fold (inner_l4_sum); } else return; /* UDP/TCP checksum changed */ - sum = ip_csum_update (sum, inner_l4_old_sum, inner_l4_sum, - ip4_header_t /* cheat */ , + sum = ip_csum_update (sum, inner_l4_old_sum, inner_l4_sum, ip4_header_t, checksum); /* UDP/TCP Ports changed */ - if (old_port[VLIB_TX] && new_port[VLIB_TX]) - sum = ip_csum_update (sum, old_port[VLIB_TX], new_port[VLIB_TX], - ip4_header_t /* cheat */ , - length /* changed member */ ); - - if (old_port[VLIB_RX] && new_port[VLIB_RX]) - sum = ip_csum_update (sum, old_port[VLIB_RX], new_port[VLIB_RX], - ip4_header_t /* cheat */ , - length /* changed member */ ); + sum = ip_csum_update (sum, old_port[VLIB_TX], new_port[VLIB_TX], + udp_header_t, dst_port); + sum = ip_csum_update (sum, old_port[VLIB_RX], new_port[VLIB_RX], + udp_header_t, src_port); cnat_ip6_translate_l3 (ip6, new_addr); /* IP src/dst addr changed */ - if (has_ip6_address (&new_addr[VLIB_TX])) - { - sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]); - sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]); - } + sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]); + sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]); - if (has_ip6_address (&new_addr[VLIB_RX])) - { - sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]); - sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]); - sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]); - } + sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]); + sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]); + sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]); icmp->checksum = ip_csum_fold (sum); } static_always_inline void -cnat_translation_ip6 (const cnat_session_t * session, - ip6_header_t * ip6, udp_header_t * udp) +cnat_translation_ip6 (const cnat_session_t *session, ip6_header_t *ip6, + udp_header_t *udp, u32 oflags) { tcp_header_t *tcp = (tcp_header_t *) udp; ip6_address_t new_addr[VLIB_N_DIR]; @@ -652,7 +622,7 @@ cnat_translation_ip6 (const cnat_session_t * session, if (ip6->protocol == IP_PROTOCOL_TCP) { ip_csum_t sum = tcp->checksum; - cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port); + cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port, oflags); tcp->checksum = ip_csum_fold (sum); cnat_ip6_translate_l3 (ip6, new_addr); cnat_tcp_update_session_lifetime (tcp, session->value.cs_ts_index); @@ -660,7 +630,7 @@ cnat_translation_ip6 (const cnat_session_t * session, else if (ip6->protocol == IP_PROTOCOL_UDP) { ip_csum_t sum = udp->checksum; - cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port); + cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port, oflags); udp->checksum = ip_csum_fold (sum); cnat_ip6_translate_l3 (ip6, new_addr); } @@ -743,6 +713,18 @@ cnat_session_make_key (vlib_buffer_t *b, ip_address_family_t af, session->key.cs_port[VLIB_RX] = udp->src_port; session->key.cs_port[VLIB_TX] = udp->dst_port; } + else if (ip4->protocol == IP_PROTOCOL_SCTP) + { + sctp_header_t *sctp; + sctp = (sctp_header_t *) (ip4 + 1); + ip46_address_set_ip4 (&session->key.cs_ip[VLIB_TX], + &ip4->dst_address); + ip46_address_set_ip4 (&session->key.cs_ip[VLIB_RX], + &ip4->src_address); + session->key.cs_proto = ip4->protocol; + session->key.cs_port[VLIB_RX] = sctp->src_port; + session->key.cs_port[VLIB_TX] = sctp->dst_port; + } else goto error; } @@ -837,20 +819,74 @@ cnat_load_balance (const cnat_translation_t *ct, ip_address_family_t af, * rsession_location is the location the (return) session will be * matched at */ + +static_always_inline void +cnat_session_create (cnat_session_t *session, cnat_node_ctx_t *ctx) +{ + cnat_bihash_kv_t *bkey = (cnat_bihash_kv_t *) session; + + session->value.cs_ts_index = cnat_timestamp_new (ctx->now); + cnat_bihash_add_del (&cnat_session_db, bkey, 1); +} + static_always_inline void -cnat_session_create (cnat_session_t *session, cnat_node_ctx_t *ctx, - cnat_session_location_t rsession_location, - u8 rsession_flags) +cnat_rsession_create (cnat_session_t *session, cnat_node_ctx_t *ctx, + cnat_session_location_t rsession_location, + cnat_session_flag_t rsession_flags) { cnat_client_t *cc; cnat_bihash_kv_t rkey; cnat_session_t *rsession = (cnat_session_t *) & rkey; cnat_bihash_kv_t *bkey = (cnat_bihash_kv_t *) session; - cnat_bihash_kv_t rvalue; - int rv; + int rv, n_retries = 0; + static u32 sport_seed = 0; - session->value.cs_ts_index = cnat_timestamp_new (ctx->now); - cnat_bihash_add_del (&cnat_session_db, bkey, 1); + cnat_timestamp_inc_refcnt (session->value.cs_ts_index); + + /* First create the return session */ + ip46_address_copy (&rsession->key.cs_ip[VLIB_RX], + &session->value.cs_ip[VLIB_TX]); + ip46_address_copy (&rsession->key.cs_ip[VLIB_TX], + &session->value.cs_ip[VLIB_RX]); + rsession->key.cs_proto = session->key.cs_proto; + rsession->key.cs_loc = rsession_location; + rsession->key.__cs_pad = 0; + rsession->key.cs_af = ctx->af; + rsession->key.cs_port[VLIB_RX] = session->value.cs_port[VLIB_TX]; + rsession->key.cs_port[VLIB_TX] = session->value.cs_port[VLIB_RX]; + + ip46_address_copy (&rsession->value.cs_ip[VLIB_RX], + &session->key.cs_ip[VLIB_TX]); + ip46_address_copy (&rsession->value.cs_ip[VLIB_TX], + &session->key.cs_ip[VLIB_RX]); + rsession->value.cs_ts_index = session->value.cs_ts_index; + rsession->value.cs_lbi = INDEX_INVALID; + rsession->value.flags = rsession_flags | CNAT_SESSION_IS_RETURN; + rsession->value.cs_port[VLIB_TX] = session->key.cs_port[VLIB_RX]; + rsession->value.cs_port[VLIB_RX] = session->key.cs_port[VLIB_TX]; + +retry_add_ression: + rv = cnat_bihash_add_del (&cnat_session_db, &rkey, + 2 /* add but don't overwrite */); + if (rv) + { + if (!(rsession_flags & CNAT_SESSION_RETRY_SNAT)) + return; + + /* return session add failed pick an new random src port */ + rsession->value.cs_port[VLIB_TX] = session->key.cs_port[VLIB_RX] = + random_u32 (&sport_seed); + if (n_retries++ < 100) + goto retry_add_ression; + else + { + clib_warning ("Could not find a free port after 100 tries"); + /* translate this packet, but don't create state */ + return; + } + } + + cnat_bihash_add_del (&cnat_session_db, bkey, 1 /* add */); if (!(rsession_flags & CNAT_SESSION_FLAG_NO_CLIENT)) { @@ -894,39 +930,6 @@ cnat_session_create (cnat_session_t *session, cnat_node_ctx_t *ctx, } } - /* create the reverse flow key */ - ip46_address_copy (&rsession->key.cs_ip[VLIB_RX], - &session->value.cs_ip[VLIB_TX]); - ip46_address_copy (&rsession->key.cs_ip[VLIB_TX], - &session->value.cs_ip[VLIB_RX]); - rsession->key.cs_proto = session->key.cs_proto; - rsession->key.cs_loc = rsession_location; - rsession->key.__cs_pad = 0; - rsession->key.cs_af = ctx->af; - rsession->key.cs_port[VLIB_RX] = session->value.cs_port[VLIB_TX]; - rsession->key.cs_port[VLIB_TX] = session->value.cs_port[VLIB_RX]; - - /* First search for existing reverse session */ - rv = cnat_bihash_search_i2 (&cnat_session_db, &rkey, &rvalue); - if (!rv) - { - /* Reverse session already exists - cleanup before creating for refcnts */ - cnat_session_t *found_rsession = (cnat_session_t *) & rvalue; - cnat_session_free (found_rsession); - } - /* add the reverse flow */ - ip46_address_copy (&rsession->value.cs_ip[VLIB_RX], - &session->key.cs_ip[VLIB_TX]); - ip46_address_copy (&rsession->value.cs_ip[VLIB_TX], - &session->key.cs_ip[VLIB_RX]); - rsession->value.cs_ts_index = session->value.cs_ts_index; - rsession->value.cs_lbi = INDEX_INVALID; - rsession->value.flags = rsession_flags | CNAT_SESSION_IS_RETURN; - rsession->value.cs_port[VLIB_TX] = session->key.cs_port[VLIB_RX]; - rsession->value.cs_port[VLIB_RX] = session->key.cs_port[VLIB_TX]; - - cnat_bihash_add_del (&cnat_session_db, &rkey, 1); } always_inline uword diff --git a/src/plugins/cnat/cnat_node_feature.c b/src/plugins/cnat/cnat_node_feature.c index aced4cd0a15..9b2c0c2fe06 100644 --- a/src/plugins/cnat/cnat_node_feature.c +++ b/src/plugins/cnat/cnat_node_feature.c @@ -143,7 +143,10 @@ cnat_input_feature_fn (vlib_main_t *vm, vlib_node_runtime_t *node, /* refcnt session in current client */ cnat_client_cnt_session (cc); - cnat_session_create (session, ctx, CNAT_LOCATION_OUTPUT, rsession_flags); + cnat_session_create (session, ctx); + if (!(ct->flags & CNAT_TR_FLAG_NO_RETURN_SESSION)) + cnat_rsession_create (session, ctx, CNAT_LOCATION_OUTPUT, + rsession_flags); trace_flags |= CNAT_TRACE_SESSION_CREATED; } @@ -156,9 +159,9 @@ cnat_input_feature_fn (vlib_main_t *vm, vlib_node_runtime_t *node, } if (AF_IP4 == ctx->af) - cnat_translation_ip4 (session, ip4, udp0); + cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags); else - cnat_translation_ip6 (session, ip6, udp0); + cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags); if (NULL != ct) { @@ -320,14 +323,17 @@ cnat_output_feature_fn (vlib_main_t *vm, vlib_node_runtime_t *node, CNAT_SESSION_FLAG_NO_CLIENT | CNAT_SESSION_FLAG_ALLOC_PORT; trace_flags |= CNAT_TRACE_SESSION_CREATED; - cnat_session_create (session, ctx, CNAT_LOCATION_INPUT, - CNAT_SESSION_FLAG_NO_CLIENT); + + cnat_session_create (session, ctx); + cnat_rsession_create (session, ctx, CNAT_LOCATION_INPUT, + CNAT_SESSION_FLAG_NO_CLIENT | + CNAT_SESSION_RETRY_SNAT); } if (AF_IP4 == ctx->af) - cnat_translation_ip4 (session, ip4, udp0); + cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags); else - cnat_translation_ip6 (session, ip6, udp0); + cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags); trace: if (PREDICT_FALSE (ctx->do_trace)) diff --git a/src/plugins/cnat/cnat_node_snat.c b/src/plugins/cnat/cnat_node_snat.c index 9212d67ead6..57530eb397d 100644 --- a/src/plugins/cnat/cnat_node_snat.c +++ b/src/plugins/cnat/cnat_node_snat.c @@ -129,15 +129,15 @@ cnat_snat_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, CNAT_SESSION_FLAG_NO_CLIENT | CNAT_SESSION_FLAG_ALLOC_PORT; trace_flags |= CNAT_TRACE_SESSION_CREATED; - cnat_session_create (session, ctx, CNAT_LOCATION_FIB, - CNAT_SESSION_FLAG_HAS_SNAT); + cnat_session_create (session, ctx); + cnat_rsession_create (session, ctx, CNAT_LOCATION_FIB, + CNAT_SESSION_FLAG_HAS_SNAT); } - if (AF_IP4 == ctx->af) - cnat_translation_ip4 (session, ip4, udp0); + cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags); else - cnat_translation_ip6 (session, ip6, udp0); + cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags); trace: if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED)) diff --git a/src/plugins/cnat/cnat_node_vip.c b/src/plugins/cnat/cnat_node_vip.c index f166bd4f194..d320746c5fa 100644 --- a/src/plugins/cnat/cnat_node_vip.c +++ b/src/plugins/cnat/cnat_node_vip.c @@ -168,7 +168,9 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b, /* refcnt session in current client */ cnat_client_cnt_session (cc); - cnat_session_create (session, ctx, CNAT_LOCATION_FIB, rsession_flags); + cnat_session_create (session, ctx); + if (!(ct->flags & CNAT_TR_FLAG_NO_RETURN_SESSION)) + cnat_rsession_create (session, ctx, CNAT_LOCATION_FIB, rsession_flags); trace_flags |= CNAT_TRACE_SESSION_CREATED; next0 = ct->ct_lb.dpoi_next_node; @@ -176,9 +178,9 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b, } if (AF_IP4 == ctx->af) - cnat_translation_ip4 (session, ip4, udp0); + cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags); else - cnat_translation_ip6 (session, ip6, udp0); + cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags); if (NULL != ct) { diff --git a/src/plugins/cnat/cnat_scanner.c b/src/plugins/cnat/cnat_scanner.c index b3591f7e8b0..2f982711581 100644 --- a/src/plugins/cnat/cnat_scanner.c +++ b/src/plugins/cnat/cnat_scanner.c @@ -14,6 +14,7 @@ */ #include <cnat/cnat_session.h> +#include <vlibmemory/api.h> #include <cnat/cnat_client.h> static uword diff --git a/src/plugins/cnat/cnat_session.c b/src/plugins/cnat/cnat_session.c index 216d2575c37..0f1cd43f501 100644 --- a/src/plugins/cnat/cnat_session.c +++ b/src/plugins/cnat/cnat_session.c @@ -94,7 +94,8 @@ format_cnat_session (u8 * s, va_list * args) cnat_session_t *sess = va_arg (*args, cnat_session_t *); CLIB_UNUSED (int verbose) = va_arg (*args, int); f64 ts = 0; - if (!pool_is_free_index (cnat_timestamps, sess->value.cs_ts_index)) + + if (!cnat_ts_is_free_index (sess->value.cs_ts_index)) ts = cnat_timestamp_exp (sess->value.cs_ts_index); s = format ( @@ -172,15 +173,43 @@ cnat_session_purge (void) return (0); } +void +cnat_reverse_session_free (cnat_session_t *session) +{ + cnat_bihash_kv_t bkey, bvalue; + cnat_session_t *rsession = (cnat_session_t *) &bkey; + int rv; + + ip46_address_copy (&rsession->key.cs_ip[VLIB_RX], + &session->value.cs_ip[VLIB_TX]); + ip46_address_copy (&rsession->key.cs_ip[VLIB_TX], + &session->value.cs_ip[VLIB_RX]); + rsession->key.cs_proto = session->key.cs_proto; + rsession->key.cs_loc = session->key.cs_loc == CNAT_LOCATION_OUTPUT ? + CNAT_LOCATION_INPUT : + CNAT_LOCATION_OUTPUT; + rsession->key.__cs_pad = 0; + rsession->key.cs_af = session->key.cs_af; + rsession->key.cs_port[VLIB_RX] = session->value.cs_port[VLIB_TX]; + rsession->key.cs_port[VLIB_TX] = session->value.cs_port[VLIB_RX]; + + rv = cnat_bihash_search_i2 (&cnat_session_db, &bkey, &bvalue); + if (!rv) + { + /* other session is in bihash */ + cnat_session_t *rsession = (cnat_session_t *) &bvalue; + cnat_session_free (rsession); + } +} + u64 cnat_session_scan (vlib_main_t * vm, f64 start_time, int i) { BVT (clib_bihash) * h = &cnat_session_db; int j, k; - /* Don't scan the l2 fib if it hasn't been instantiated yet */ if (alloc_arena (h) == 0) - return 0.0; + return 0; for ( /* caller saves starting point */ ; i < h->nbuckets; i++) { @@ -210,7 +239,7 @@ cnat_session_scan (vlib_main_t * vm, f64 start_time, int i) { for (k = 0; k < BIHASH_KVP_PER_PAGE; k++) { - if (v->kvp[k].key[0] == ~0ULL && v->kvp[k].value[0] == ~0ULL) + if (BV (clib_bihash_is_free) (&v->kvp[k])) continue; cnat_session_t *session = (cnat_session_t *) & v->kvp[k]; @@ -219,6 +248,9 @@ cnat_session_scan (vlib_main_t * vm, f64 start_time, int i) cnat_timestamp_exp (session->value.cs_ts_index)) { /* age it */ + cnat_reverse_session_free (session); + /* this should be last as deleting the session memset it to + * 0xff */ cnat_session_free (session); /* @@ -248,6 +280,12 @@ cnat_session_init (vlib_main_t * vm) cm->session_hash_memory); BV (clib_bihash_set_kvp_format_fn) (&cnat_session_db, format_cnat_session); + cnat_timestamps.next_empty_pool_idx = 0; + clib_bitmap_alloc (cnat_timestamps.ts_free, 1 << CNAT_TS_MPOOL_BITS); + clib_bitmap_set_region (cnat_timestamps.ts_free, 0, 1, + 1 << CNAT_TS_MPOOL_BITS); + clib_spinlock_init (&cnat_timestamps.ts_lock); + return (NULL); } @@ -258,21 +296,38 @@ cnat_timestamp_show (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { cnat_timestamp_t *ts; - clib_rwlock_reader_lock (&cnat_main.ts_lock); - pool_foreach (ts, cnat_timestamps) + int ts_cnt = 0, cnt; + u8 verbose = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { - vlib_cli_output (vm, "[%d] last_seen:%f lifetime:%u ref:%u", - ts - cnat_timestamps, ts->last_seen, ts->lifetime, - ts->refcnt); + if (unformat (input, "verbose")) + verbose = 1; + else + return (clib_error_return (0, "unknown input '%U'", + format_unformat_error, input)); + } + + for (int i = 0; i < cnat_timestamps.next_empty_pool_idx; i++) + { + cnt = pool_elts (cnat_timestamps.ts_pools[i]); + ts_cnt += cnt; + vlib_cli_output (vm, "-- Pool %d [%d/%d]", i, cnt, + pool_header (cnat_timestamps.ts_pools[i])->max_elts); + if (!verbose) + continue; + pool_foreach (ts, cnat_timestamps.ts_pools[i]) + vlib_cli_output (vm, "[%d] last_seen:%f lifetime:%u ref:%u", + ts - cnat_timestamps.ts_pools[i], ts->last_seen, + ts->lifetime, ts->refcnt); } - clib_rwlock_reader_unlock (&cnat_main.ts_lock); + vlib_cli_output (vm, "Total timestamps %d", ts_cnt); return (NULL); } VLIB_CLI_COMMAND (cnat_timestamp_show_cmd, static) = { .path = "show cnat timestamp", .function = cnat_timestamp_show, - .short_help = "show cnat timestamp", + .short_help = "show cnat timestamp [verbose]", .is_mp_safe = 1, }; diff --git a/src/plugins/cnat/cnat_session.h b/src/plugins/cnat/cnat_session.h index 072bb10f96f..a0a28c9a818 100644 --- a/src/plugins/cnat/cnat_session.h +++ b/src/plugins/cnat/cnat_session.h @@ -129,6 +129,11 @@ typedef enum cnat_session_flag_t_ /* Debug flag marking return sessions */ CNAT_SESSION_IS_RETURN = (1 << 4), + + /** On conflicts when adding the return session, try to sNAT the + * forward session, and dNAT the return session with a random port */ + CNAT_SESSION_RETRY_SNAT = (1 << 5), + } cnat_session_flag_t; typedef enum cnat_session_location_t_ diff --git a/src/plugins/cnat/cnat_snat_policy.c b/src/plugins/cnat/cnat_snat_policy.c index d59156f34c8..cd9bfef492a 100644 --- a/src/plugins/cnat/cnat_snat_policy.c +++ b/src/plugins/cnat/cnat_snat_policy.c @@ -29,6 +29,8 @@ unformat_cnat_snat_interface_map_type (unformat_input_t *input, va_list *args) *a = CNAT_SNAT_IF_MAP_INCLUDE_V6; else if (unformat (input, "k8s")) *a = CNAT_SNAT_IF_MAP_INCLUDE_POD; + else if (unformat (input, "host")) + *a = CNAT_SNAT_IF_MAP_INCLUDE_HOST; else return 0; return 1; @@ -49,6 +51,9 @@ format_cnat_snat_interface_map_type (u8 *s, va_list *args) case CNAT_SNAT_IF_MAP_INCLUDE_POD: s = format (s, "k8s pod"); break; + case CNAT_SNAT_IF_MAP_INCLUDE_HOST: + s = format (s, "k8s host"); + break; default: s = format (s, "(unknown)"); break; @@ -108,7 +113,7 @@ cnat_snat_policy_add_del_if_command_fn (vlib_main_t *vm, vnet_main_t *vnm = vnet_get_main (); int is_add = 1; u32 sw_if_index = ~0; - u32 table; + u32 table = 0; int rv; while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) @@ -296,6 +301,14 @@ cnat_snat_policy_k8s (vlib_buffer_t *b, cnat_session_t *session) u32 in_if = vnet_buffer (b)->sw_if_index[VLIB_RX]; u32 out_if = vnet_buffer (b)->sw_if_index[VLIB_TX]; + /* we should never snat traffic that we punt to the host, pass traffic as it + * is for us */ + if (clib_bitmap_get (cpm->interface_maps[CNAT_SNAT_IF_MAP_INCLUDE_HOST], + out_if)) + { + return 0; + } + /* source nat for outgoing connections */ if (cnat_snat_policy_interface_enabled (in_if, af)) if (cnat_search_snat_prefix (dst_addr, af)) diff --git a/src/plugins/cnat/cnat_snat_policy.h b/src/plugins/cnat/cnat_snat_policy.h index 987ae494e16..61c2382602f 100644 --- a/src/plugins/cnat/cnat_snat_policy.h +++ b/src/plugins/cnat/cnat_snat_policy.h @@ -45,6 +45,9 @@ typedef enum cnat_snat_interface_map_type_t_ CNAT_SNAT_IF_MAP_INCLUDE_V4 = AF_IP4, CNAT_SNAT_IF_MAP_INCLUDE_V6 = AF_IP6, CNAT_SNAT_IF_MAP_INCLUDE_POD, + /* CNAT_SNAT_IF_MAP_INCLUDE_HOST is used for interfaces used for punt, + replicating uplink */ + CNAT_SNAT_IF_MAP_INCLUDE_HOST, CNAT_N_SNAT_IF_MAP, } cnat_snat_interface_map_type_t; diff --git a/src/plugins/cnat/cnat_src_policy.c b/src/plugins/cnat/cnat_src_policy.c index cac24b7742c..8f3f3375148 100644 --- a/src/plugins/cnat/cnat_src_policy.c +++ b/src/plugins/cnat/cnat_src_policy.c @@ -59,8 +59,8 @@ cnat_vip_default_source_policy (vlib_main_t * vm, u16 sport; sport = udp0->src_port; /* Allocate a port only if asked and if we actually sNATed */ - if ((ct->flags & CNAT_TRANSLATION_FLAG_ALLOCATE_PORT) - && (*rsession_flags & CNAT_SESSION_FLAG_HAS_SNAT)) + if ((ct->flags & CNAT_TR_FLAG_ALLOCATE_PORT) && + (*rsession_flags & CNAT_SESSION_FLAG_HAS_SNAT)) { sport = 0; /* force allocation */ session->value.flags |= CNAT_SESSION_FLAG_ALLOC_PORT; diff --git a/src/plugins/cnat/cnat_translation.c b/src/plugins/cnat/cnat_translation.c index 049809a8684..513cedf0446 100644 --- a/src/plugins/cnat/cnat_translation.c +++ b/src/plugins/cnat/cnat_translation.c @@ -18,8 +18,10 @@ #include <vnet/fib/fib_entry_track.h> #include <vnet/dpo/load_balance.h> #include <vnet/dpo/drop_dpo.h> +#include <vnet/dpo/dpo.h> #include <cnat/cnat_translation.h> +#include <cnat/cnat_maglev.h> #include <cnat/cnat_session.h> #include <cnat/cnat_client.h> @@ -82,6 +84,7 @@ cnat_tracker_release (cnat_ep_trk_t * trk) /* We only track fully resolved endpoints */ if (!(trk->ct_flags & CNAT_TRK_ACTIVE)) return; + dpo_reset (&trk->ct_dpo); // undo fib_entry_contribute_forwarding fib_entry_untrack (trk->ct_fei, trk->ct_sibling); } @@ -200,110 +203,7 @@ cnat_remove_translation_from_db (index_t cci, cnat_endpoint_t * vip, clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 0); } -typedef struct -{ - cnat_ep_trk_t *trk; - u32 index; - u32 offset; - u32 skip; -} cnat_maglev_entry_t; -static int -cnat_maglev_entry_compare (void *_a, void *_b) -{ - cnat_ep_trk_t *a = ((cnat_maglev_entry_t *) _a)->trk; - cnat_ep_trk_t *b = ((cnat_maglev_entry_t *) _b)->trk; - int rv = 0; - if ((rv = - ip_address_cmp (&a->ct_ep[VLIB_TX].ce_ip, &b->ct_ep[VLIB_TX].ce_ip))) - return rv; - if ((rv = a->ct_ep[VLIB_TX].ce_port - a->ct_ep[VLIB_TX].ce_port)) - return rv; - if ((rv = - ip_address_cmp (&a->ct_ep[VLIB_RX].ce_ip, &b->ct_ep[VLIB_RX].ce_ip))) - return rv; - if ((rv = a->ct_ep[VLIB_RX].ce_port - a->ct_ep[VLIB_RX].ce_port)) - return rv; - return 0; -} - -static void -cnat_translation_init_maglev (cnat_translation_t *ct) -{ - cnat_maglev_entry_t *backends = NULL, *bk; - cnat_main_t *cm = &cnat_main; - u32 done = 0; - cnat_ep_trk_t *trk; - int ep_idx = 0; - - vec_foreach (trk, ct->ct_active_paths) - { - cnat_maglev_entry_t bk; - u32 h1, h2; - - if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip)) - { - u32 a, b, c; - a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32; - b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 | - (u64) trk->ct_ep[VLIB_RX].ce_port; - c = ip_addr_v4 (&trk->ct_ep[VLIB_RX].ce_ip).data_u32; - hash_v3_mix32 (a, b, c); - hash_v3_finalize32 (a, b, c); - h1 = c; - h2 = b; - } - else - { - u64 a, b, c; - a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0] ^ - ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1]; - b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 | - (u64) trk->ct_ep[VLIB_RX].ce_port; - c = ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[0] ^ - ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[1]; - hash_mix64 (a, b, c); - h1 = c; - h2 = b; - } - - bk.offset = h1 % cm->maglev_len; - bk.skip = h2 % (cm->maglev_len - 1) + 1; - bk.index = ep_idx++; - bk.trk = trk; - vec_add1 (backends, bk); - } - - if (0 == ep_idx) - return; - - vec_sort_with_function (backends, cnat_maglev_entry_compare); - - /* Don't free if previous vector exists, just zero */ - vec_validate (ct->lb_maglev, cm->maglev_len); - vec_set (ct->lb_maglev, -1); - - while (1) - { - vec_foreach (bk, backends) - { - u32 next = 0; - u32 c = (bk->offset + next * bk->skip) % cm->maglev_len; - while (ct->lb_maglev[c] != (u32) -1) - { - next++; - c = (bk->offset + next * bk->skip) % cm->maglev_len; - } - ct->lb_maglev[c] = bk->index; - done++; - if (done == cm->maglev_len) - goto finished; - } - } - -finished: - vec_free (backends); -} static void cnat_translation_stack (cnat_translation_t * ct) @@ -323,8 +223,11 @@ cnat_translation_stack (cnat_translation_t * ct) if (trk->ct_flags & CNAT_TRK_ACTIVE) vec_add1 (ct->ct_active_paths, *trk); + flow_hash_config_t fhc = IP_FLOW_HASH_DEFAULT; + if (ct->fhc != 0) + fhc = ct->fhc; lbi = load_balance_create (vec_len (ct->ct_active_paths), - fib_proto_to_dpo (fproto), IP_FLOW_HASH_DEFAULT); + fib_proto_to_dpo (fproto), fhc); ep_idx = 0; vec_foreach (trk, ct->ct_active_paths) @@ -335,7 +238,7 @@ cnat_translation_stack (cnat_translation_t * ct) dpo_set (&ct->ct_lb, DPO_LOAD_BALANCE, dproto, lbi); dpo_stack (cnat_client_dpo, dproto, &ct->ct_lb, &ct->ct_lb); - ct->flags |= CNAT_TRANSLATION_STACKED; + ct->flags |= CNAT_TR_FLAG_STACKED; } int @@ -365,8 +268,9 @@ cnat_translation_delete (u32 id) u32 cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto, cnat_endpoint_tuple_t *paths, u8 flags, - cnat_lb_type_t lb_type) + cnat_lb_type_t lb_type, flow_hash_config_t fhc) { + const dpo_id_t tmp = DPO_INVALID; cnat_endpoint_tuple_t *path; const cnat_client_t *cc; cnat_translation_t *ct; @@ -398,6 +302,7 @@ cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto, ct->ct_cci = cci; ct->index = ct - cnat_translation_pool; ct->lb_type = lb_type; + ct->fhc = fhc; cnat_add_translation_to_db (cci, vip, proto, ct->index); cnat_client_translation_added (cci); @@ -417,7 +322,7 @@ cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto, } vec_reset_length (ct->ct_paths); - ct->flags &= ~CNAT_TRANSLATION_STACKED; + ct->flags &= ~CNAT_TR_FLAG_STACKED; u64 path_idx = 0; vec_foreach (path, paths) @@ -438,6 +343,7 @@ cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto, clib_memcpy (&trk->ct_ep[VLIB_RX], &path->src_ep, sizeof (trk->ct_ep[VLIB_RX])); trk->ct_flags = path->ep_flags; + trk->ct_dpo = tmp; cnat_tracker_track (ct->index, trk); } @@ -486,6 +392,11 @@ format_cnat_translation (u8 * s, va_list * args) format_ip_protocol, ct->ct_proto); s = format (s, "lb:%U ", format_cnat_lb_type, ct->lb_type); + if ((ct->fhc == 0) || (ct->fhc == IP_FLOW_HASH_DEFAULT)) + s = format (s, "fhc:0x%x(default)", IP_FLOW_HASH_DEFAULT); + else + s = format (s, "fhc:0x%x", ct->fhc); + vec_foreach (ck, ct->ct_paths) s = format (s, "\n%U", format_cnat_ep_trk, ck, 2); @@ -615,7 +526,7 @@ cnat_translation_back_walk_notify (fib_node_t * node, /* If we have more than FIB_PATH_LIST_POPULAR paths * we might get called during path tracking * (cnat_tracker_track) */ - if (!(ct->flags & CNAT_TRANSLATION_STACKED)) + if (!(ct->flags & CNAT_TR_FLAG_STACKED)) return (FIB_NODE_BACK_WALK_CONTINUE); cnat_translation_stack (ct); @@ -678,8 +589,9 @@ cnat_translation_cli_add_del (vlib_main_t * vm, } } + flow_hash_config_t fhc = 0; if (INDEX_INVALID == del_index) - cnat_translation_update (&vip, proto, paths, flags, lb_type); + cnat_translation_update (&vip, proto, paths, flags, lb_type, fhc); else cnat_translation_delete (del_index); @@ -764,11 +676,11 @@ cnat_if_addr_add_del_backend_cb (addr_resolution_t * ar, ep->ce_flags |= CNAT_EP_FLAG_RESOLVED; } - ct->flags &= ~CNAT_TRANSLATION_STACKED; + ct->flags &= ~CNAT_TR_FLAG_STACKED; cnat_tracker_track (ar->cti, trk); cnat_translation_stack (ct); - ct->flags |= CNAT_TRANSLATION_STACKED; + ct->flags |= CNAT_TR_FLAG_STACKED; } static void @@ -825,7 +737,7 @@ cnat_translation_init (vlib_main_t * vm) ip6_main_t *i6m = &ip6_main; cnat_main_t *cm = &cnat_main; cnat_translation_fib_node_type = - fib_node_register_new_type (&cnat_translation_vft); + fib_node_register_new_type ("cnat-translation", &cnat_translation_vft); clib_bihash_init_8_8 (&cnat_translation_db, "CNat translation DB", cm->translation_hash_buckets, diff --git a/src/plugins/cnat/cnat_translation.h b/src/plugins/cnat/cnat_translation.h index 97b0c908b42..9bb3455d9fe 100644 --- a/src/plugins/cnat/cnat_translation.h +++ b/src/plugins/cnat/cnat_translation.h @@ -60,12 +60,14 @@ typedef struct cnat_ep_trk_t_ typedef enum cnat_translation_flag_t_ { /* Do allocate a source port */ - CNAT_TRANSLATION_FLAG_ALLOCATE_PORT = (1 << 0), + CNAT_TR_FLAG_ALLOCATE_PORT = (1 << 0), /* Has this translation been satcked ? * this allow not being called twice when * with more then FIB_PATH_LIST_POPULAR backends */ - CNAT_TRANSLATION_STACKED = (1 << 1), -} cnat_translation_flag_t; + CNAT_TR_FLAG_STACKED = (1 << 1), + /* Do not create a return session */ + CNAT_TR_FLAG_NO_RETURN_SESSION = (1 << 2), +} __clib_packed cnat_translation_flag_t; typedef enum { @@ -76,11 +78,11 @@ typedef enum CNAT_ADDR_N_RESOLUTIONS, } cnat_addr_resol_type_t; -typedef enum __attribute__ ((__packed__)) +typedef enum { CNAT_LB_DEFAULT, CNAT_LB_MAGLEV, -} cnat_lb_type_t; +} __clib_packed cnat_lb_type_t; /** * Entry used to account for a translation's backend @@ -160,13 +162,18 @@ typedef struct cnat_translation_t_ /** * Translation flags */ - u8 flags; + cnat_translation_flag_t flags; /** * Type of load balancing */ cnat_lb_type_t lb_type; + /** + * Type of flow hash config + */ + flow_hash_config_t fhc; + union { u32 *lb_maglev; @@ -189,7 +196,8 @@ extern u8 *format_cnat_translation (u8 * s, va_list * args); extern u32 cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t ip_proto, cnat_endpoint_tuple_t *backends, u8 flags, - cnat_lb_type_t lb_type); + cnat_lb_type_t lb_type, + flow_hash_config_t fhc); /** * Delete a translation diff --git a/src/plugins/cnat/cnat_types.c b/src/plugins/cnat/cnat_types.c index 9b164c6069d..084a03da968 100644 --- a/src/plugins/cnat/cnat_types.c +++ b/src/plugins/cnat/cnat_types.c @@ -16,8 +16,7 @@ #include <cnat/cnat_types.h> cnat_main_t cnat_main; -fib_source_t cnat_fib_source; -cnat_timestamp_t *cnat_timestamps; +cnat_timestamp_mpool_t cnat_timestamps; char *cnat_error_strings[] = { #define cnat_error(n,s) s, @@ -152,19 +151,6 @@ format_cnat_endpoint (u8 * s, va_list * args) return (s); } -static clib_error_t * -cnat_types_init (vlib_main_t * vm) -{ - cnat_fib_source = fib_source_allocate ("cnat", - CNAT_FIB_SOURCE_PRIORITY, - FIB_SOURCE_BH_SIMPLE); - - - clib_rwlock_init (&cnat_main.ts_lock); - - return (NULL); -} - void cnat_enable_disable_scanner (cnat_scanner_cmd_t event_type) { @@ -191,6 +177,8 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input) cm->session_hash_buckets = CNAT_DEFAULT_SESSION_BUCKETS; cm->translation_hash_memory = CNAT_DEFAULT_TRANSLATION_MEMORY; cm->translation_hash_buckets = CNAT_DEFAULT_TRANSLATION_BUCKETS; + cm->client_hash_memory = CNAT_DEFAULT_CLIENT_MEMORY; + cm->client_hash_buckets = CNAT_DEFAULT_CLIENT_BUCKETS; cm->snat_hash_memory = CNAT_DEFAULT_SNAT_MEMORY; cm->snat_hash_buckets = CNAT_DEFAULT_SNAT_BUCKETS; cm->snat_if_map_length = CNAT_DEFAULT_SNAT_IF_MAP_LEN; @@ -215,6 +203,12 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input) else if (unformat (input, "translation-db-memory %U", unformat_memory_size, &cm->translation_hash_memory)) ; + else if (unformat (input, "client-db-buckets %u", + &cm->client_hash_buckets)) + ; + else if (unformat (input, "client-db-memory %U", unformat_memory_size, + &cm->client_hash_memory)) + ; else if (unformat (input, "snat-db-buckets %u", &cm->snat_hash_buckets)) ; else if (unformat (input, "snat-if-map-len %u", &cm->snat_if_map_length)) @@ -250,7 +244,6 @@ cnat_get_main () } VLIB_EARLY_CONFIG_FUNCTION (cnat_config, "cnat"); -VLIB_INIT_FUNCTION (cnat_types_init); /* * fd.io coding-style-patch-verification: ON diff --git a/src/plugins/cnat/cnat_types.h b/src/plugins/cnat/cnat_types.h index c3ec74c345f..d229d21adae 100644 --- a/src/plugins/cnat/cnat_types.h +++ b/src/plugins/cnat/cnat_types.h @@ -36,12 +36,14 @@ #define CNAT_DEFAULT_SESSION_BUCKETS 1024 #define CNAT_DEFAULT_TRANSLATION_BUCKETS 1024 +#define CNAT_DEFAULT_CLIENT_BUCKETS 1024 #define CNAT_DEFAULT_SNAT_BUCKETS 1024 #define CNAT_DEFAULT_SNAT_IF_MAP_LEN 4096 #define CNAT_DEFAULT_SESSION_MEMORY (1 << 20) #define CNAT_DEFAULT_TRANSLATION_MEMORY (256 << 10) -#define CNAT_DEFAULT_SNAT_MEMORY (64 << 20) +#define CNAT_DEFAULT_CLIENT_MEMORY (256 << 10) +#define CNAT_DEFAULT_SNAT_MEMORY (64 << 10) /* Should be prime >~ 100 * numBackends */ #define CNAT_DEFAULT_MAGLEV_LEN 1009 @@ -50,11 +52,24 @@ * from fib_source.h */ #define CNAT_FIB_SOURCE_PRIORITY 0x02 -/* Initial refcnt for timestamps (2 : session & rsession) */ -#define CNAT_TIMESTAMP_INIT_REFCNT 2 +/* Initial number of timestamps for a session + * this will be incremented when adding the reverse + * session in cnat_rsession_create */ +#define CNAT_TIMESTAMP_INIT_REFCNT 1 #define MIN_SRC_PORT ((u16) 0xC000) +typedef struct +{ + /* Source and destination port. */ + u16 src_port, dst_port; + + /* Random value to distinguish connections. */ + u32 verification_tag; + + u32 checksum; +} sctp_header_t; + typedef enum cnat_trk_flag_t_ { /* Endpoint is active (static or dhcp resolved) */ @@ -62,6 +77,8 @@ typedef enum cnat_trk_flag_t_ /* Don't translate this endpoint, but still * forward. Used by maglev for DSR */ CNAT_TRK_FLAG_NO_NAT = (1 << 1), + /* */ + CNAT_TRK_FLAG_TEST_DISABLED = (1 << 7), } cnat_trk_flag_t; typedef enum @@ -105,6 +122,12 @@ typedef struct cnat_main_ /* Number of buckets of the translation bihash */ u32 translation_hash_buckets; + /* Memory size of the client bihash */ + uword client_hash_memory; + + /* Number of buckets of the client bihash */ + u32 client_hash_buckets; + /* Memory size of the source NAT prefix bihash */ uword snat_hash_memory; @@ -125,9 +148,6 @@ typedef struct cnat_main_ /* delay in seconds between two scans of session/clients tables */ f64 scanner_timeout; - /* Lock for the timestamp pool */ - clib_rwlock_t ts_lock; - /* Index of the scanner process node */ uword scanner_node_index; @@ -152,6 +172,23 @@ typedef struct cnat_timestamp_t_ u16 refcnt; } cnat_timestamp_t; +/* Create the first pool with 1 << CNAT_TS_BASE_SIZE elts */ +#define CNAT_TS_BASE_SIZE (8) +/* reserve the top CNAT_TS_MPOOL_BITS bits for finding the pool */ +#define CNAT_TS_MPOOL_BITS (6) + +typedef struct cnat_timestamp_mpool_t_ +{ + /* Increasing fixed size pools of timestamps */ + cnat_timestamp_t *ts_pools[1 << CNAT_TS_MPOOL_BITS]; + /* Bitmap of pools with free space */ + uword *ts_free; + /* Index of next pool to init */ + u8 next_empty_pool_idx; + /* ts creation lock */ + clib_spinlock_t ts_lock; +} cnat_timestamp_mpool_t; + typedef struct cnat_node_ctx_ { f64 now; @@ -165,8 +202,7 @@ extern u8 *format_cnat_endpoint (u8 * s, va_list * args); extern uword unformat_cnat_ep_tuple (unformat_input_t * input, va_list * args); extern uword unformat_cnat_ep (unformat_input_t * input, va_list * args); -extern cnat_timestamp_t *cnat_timestamps; -extern fib_source_t cnat_fib_source; +extern cnat_timestamp_mpool_t cnat_timestamps; extern cnat_main_t cnat_main; extern char *cnat_error_strings[]; |