aboutsummaryrefslogtreecommitdiffstats
path: root/src/plugins/cnat
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/cnat')
-rw-r--r--src/plugins/cnat/CMakeLists.txt1
-rw-r--r--src/plugins/cnat/FEATURE.yaml2
-rw-r--r--src/plugins/cnat/cnat.api8
-rw-r--r--src/plugins/cnat/cnat.rst42
-rw-r--r--src/plugins/cnat/cnat_api.c8
-rw-r--r--src/plugins/cnat/cnat_bihash.h9
-rw-r--r--src/plugins/cnat/cnat_client.c76
-rw-r--r--src/plugins/cnat/cnat_client.h41
-rw-r--r--src/plugins/cnat/cnat_inline.h104
-rw-r--r--src/plugins/cnat/cnat_maglev.c379
-rw-r--r--src/plugins/cnat/cnat_maglev.h21
-rw-r--r--src/plugins/cnat/cnat_node.h485
-rw-r--r--src/plugins/cnat/cnat_node_feature.c20
-rw-r--r--src/plugins/cnat/cnat_node_snat.c10
-rw-r--r--src/plugins/cnat/cnat_node_vip.c8
-rw-r--r--src/plugins/cnat/cnat_scanner.c1
-rw-r--r--src/plugins/cnat/cnat_session.c77
-rw-r--r--src/plugins/cnat/cnat_session.h5
-rw-r--r--src/plugins/cnat/cnat_snat_policy.c15
-rw-r--r--src/plugins/cnat/cnat_snat_policy.h3
-rw-r--r--src/plugins/cnat/cnat_src_policy.c4
-rw-r--r--src/plugins/cnat/cnat_translation.c136
-rw-r--r--src/plugins/cnat/cnat_translation.h22
-rw-r--r--src/plugins/cnat/cnat_types.c25
-rw-r--r--src/plugins/cnat/cnat_types.h52
25 files changed, 1031 insertions, 523 deletions
diff --git a/src/plugins/cnat/CMakeLists.txt b/src/plugins/cnat/CMakeLists.txt
index cfb55661a78..e99bf056a35 100644
--- a/src/plugins/cnat/CMakeLists.txt
+++ b/src/plugins/cnat/CMakeLists.txt
@@ -24,6 +24,7 @@ add_vpp_plugin(cnat
cnat_types.c
cnat_snat_policy.c
cnat_src_policy.c
+ cnat_maglev.c
API_FILES
cnat.api
diff --git a/src/plugins/cnat/FEATURE.yaml b/src/plugins/cnat/FEATURE.yaml
index 9deda2e94cc..880d713b63f 100644
--- a/src/plugins/cnat/FEATURE.yaml
+++ b/src/plugins/cnat/FEATURE.yaml
@@ -9,7 +9,7 @@ description: "This plugin is intended to complement the VPP's plugin_nat for
Cloud use-cases. It allows for source/destination address/port
translation based on multiple criterias. It is intended to be modular
enough so that one could write a use-case optimised translation function
- without having to deal with actually re-writing packets or maintining
+ without having to deal with actually re-writing packets or maintaining
sessions.
This plugin supports multithreading. Workers share a unique bihash where
sessions are stored."
diff --git a/src/plugins/cnat/cnat.api b/src/plugins/cnat/cnat.api
index e253084e74e..e6ad37dd6eb 100644
--- a/src/plugins/cnat/cnat.api
+++ b/src/plugins/cnat/cnat.api
@@ -1,6 +1,6 @@
/* Hey Emacs use -*- mode: C -*- */
/*
- * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Copyright (c) 2023 Cisco and/or its affiliates.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
@@ -19,14 +19,16 @@
used to control the ABF plugin
*/
-option version = "0.2.0";
+option version = "0.3.0";
import "vnet/ip/ip_types.api";
import "vnet/fib/fib_types.api";
import "vnet/interface_types.api";
+import "vnet/ip/ip.api";
enum cnat_translation_flags:u8
{
CNAT_TRANSLATION_ALLOC_PORT = 1,
+ CNAT_TRANSLATION_NO_RETURN_SESSION = 4,
};
enum cnat_endpoint_tuple_flags:u8
@@ -70,6 +72,7 @@ typedef cnat_translation
u8 flags;
vl_api_cnat_lb_type_t lb_type;
u32 n_paths;
+ vl_api_ip_flow_hash_config_v2_t flow_hash_config;
vl_api_cnat_endpoint_tuple_t paths[n_paths];
};
@@ -172,6 +175,7 @@ enum cnat_snat_policy_table:u8
CNAT_POLICY_INCLUDE_V4 = 0,
CNAT_POLICY_INCLUDE_V6 = 1,
CNAT_POLICY_POD = 2,
+ CNAT_POLICY_HOST = 3,
};
autoreply define cnat_snat_policy_add_del_if
diff --git a/src/plugins/cnat/cnat.rst b/src/plugins/cnat/cnat.rst
index 8781f405a23..b0426f35373 100644
--- a/src/plugins/cnat/cnat.rst
+++ b/src/plugins/cnat/cnat.rst
@@ -9,7 +9,7 @@ Overview
________
This plugin covers specific NAT use-cases that come mostly
-from the container networking world. On the contraty of the
+from the container networking world. Contrary to the
NAT concepts used for e.g. a home gateway, there is no notion
of 'outside' and 'inside'. We handle Virtual (or Real) IPs and
translations of the packets destined to them
@@ -33,9 +33,9 @@ that will store the packet rewrite to do and the one to undo
until the flow is reset or a timeout is reached
A ``session`` is a fully resolved 9-tuple of ``src_ip, src_port, dest_ip, dest_port, proto``
-to match incoming packets, and their new attributes ``new_src_ip, new_src_port, new_dest_ip, new_dest_port``. It allows for ``backend`` stickyness and a fast-path for established connections.
+to match incoming packets, and their new attributes ``new_src_ip, new_src_port, new_dest_ip, new_dest_port``. It allows for ``backend`` stickiness and a fast-path for established connections.
-These ``sessions`` expire after 30s for regular ``sessions`` and 1h for estabished
+These ``sessions`` expire after 30s for regular ``sessions`` and 1h for established
TCP connections. These can be changed in vpp's configuration file
.. code-block:: console
@@ -64,7 +64,7 @@ assigned to an interface
If ``30.0.0.2`` is the address of an interface, we can use the following
-to do the same translation, and additionnaly change the source.
+to do the same translation, and additionally change the source
address with ``1.2.3.4``
.. code-block:: console
@@ -75,17 +75,17 @@ To show existing translations and sessions you can use
.. code-block:: console
- cnat show session verbose
- cant show translation
+ show cnat session verbose
+ show cnat translation
SourceNATing outgoing traffic
-----------------------------
-A independant part of the plugin allows changing the source address
+An independent part of the plugin allows changing the source address
of outgoing traffic on a per-interface basis.
-In the following example, all traffic comming from ``tap0`` and NOT
+In the following example, all traffic coming from ``tap0`` and NOT
going to ``20.0.0.0/24`` will be source NAT-ed with ``30.0.0.1``.
On the way back the translation will be undone.
@@ -94,10 +94,18 @@ address assigned to an interface)
.. code-block:: console
- cnat snat with 30.0.0.1
- cnat snat exclude 20.0.0.0/24
+ set cnat snat-policy addr 30.0.0.1
+ set cnat snat-policy if-pfx
+ set cnat snat-policy if table include-v4 tap0
+ set cnat snat-policy prefix 20.0.0.0/24
set interface feature tap0 cnat-snat-ip4 arc ip4-unicast
+To show the enforced snat policies:
+
+.. code-block:: console
+
+ show cnat snat-policy
+
Other parameters
----------------
@@ -105,7 +113,7 @@ In vpp's startup file, you can also configure the bihash sizes for
* the translation bihash ``(proto, port) -> translation``
* the session bihash ``src_ip, src_port, dest_ip, dest_port, proto -> new_src_ip, new_src_port, new_dest_ip, new_dest_port``
-* the snat bihash for searching ``snat exclude`` prefixes
+* the snat bihash for searching ``snat-policy`` excluded prefixes
.. code-block:: console
@@ -126,19 +134,19 @@ This plugin is built to be extensible. For now two NAT types are defined, ``cnat
* Session lookup : ``rv`` will be set to ``0`` if a session was found
* Translation primitives ``cnat_translation_ip4`` based on sessions
* A session creation primitive ``cnat_session_create``
+* A reverse session creation primitive ``cnat_rsession_create``
-Creating a session will also create a reverse session (for matching return traffic),
-and call a NAT node back that will perform the translation.
+Creating a session will also create a reverse session matching return traffic, unless told otherwise by setting ``CNAT_TR_FLAG_NO_RETURN_SESSION`` on the translation. With a reverse session in place, the NAT nodes are called on the return flow and perform the inverse translation.
Known limitations
_________________
+This plugin is still under development; it lacks the following features:
+This plugin is still under development, it lacks the following features :
* Load balancing doesn't support parametric probabilities
-* VRFs aren't supported. All rules apply to fib table 0 only
+* VRFs are not supported; all rules apply regardless of the FIB table.
* Programmatic session handling (deletion, lifetime updates) aren't supported
-* ICMP is not yet supported
-* Traffic matching is only done based on ``(proto, dst_addr, dst_port)`` source matching isn't supported
+* Translations (i.e. rewriting the destination address) only match on the
+  three-tuple ``(proto, dst_addr, dst_port)``; other matches are not supported
* Statistics & session tracking are still rudimentary.
diff --git a/src/plugins/cnat/cnat_api.c b/src/plugins/cnat/cnat_api.c
index ea4b3aeaaef..c578e303499 100644
--- a/src/plugins/cnat/cnat_api.c
+++ b/src/plugins/cnat/cnat_api.c
@@ -81,7 +81,7 @@ cnat_endpoint_encode (const cnat_endpoint_t * in,
if (in->ce_flags & CNAT_EP_FLAG_RESOLVED)
ip_address_encode2 (&in->ce_ip, &out->addr);
else
- clib_memset ((void *) &in->ce_ip, 0, sizeof (in->ce_ip));
+ clib_memset (&out->addr, 0, sizeof (out->addr));
}
static void
@@ -97,6 +97,7 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t
int rv = 0;
u32 pi, n_paths;
cnat_lb_type_t lb_type;
+ flow_hash_config_t flow_hash_config = 0;
rv = ip_proto_decode (mp->translation.ip_proto, &ip_proto);
@@ -123,7 +124,10 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t
flags |= CNAT_FLAG_EXCLUSIVE;
lb_type = (cnat_lb_type_t) mp->translation.lb_type;
- id = cnat_translation_update (&vip, ip_proto, paths, flags, lb_type);
+ flow_hash_config = (flow_hash_config_t) clib_net_to_host_u32 (
+ mp->translation.flow_hash_config);
+ id = cnat_translation_update (&vip, ip_proto, paths, flags, lb_type,
+ flow_hash_config);
vec_free (paths);
diff --git a/src/plugins/cnat/cnat_bihash.h b/src/plugins/cnat/cnat_bihash.h
index c488e61a07d..75099f6bfdb 100644
--- a/src/plugins/cnat/cnat_bihash.h
+++ b/src/plugins/cnat/cnat_bihash.h
@@ -44,11 +44,16 @@ typedef struct
u64 value[7];
} clib_bihash_kv_40_56_t;
+static inline void
+clib_bihash_mark_free_40_56 (clib_bihash_kv_40_56_t *v)
+{
+ v->value[0] = 0xFEEDFACE8BADF00DULL;
+}
+
static inline int
clib_bihash_is_free_40_56 (const clib_bihash_kv_40_56_t *v)
{
- /* Free values are clib_memset to 0xff, check a bit... */
- if (v->key[0] == ~0ULL && v->value[0] == ~0ULL)
+ if (v->value[0] == 0xFEEDFACE8BADF00DULL)
return 1;
return 0;
}
diff --git a/src/plugins/cnat/cnat_client.c b/src/plugins/cnat/cnat_client.c
index b8fcb9add64..a28896a4c12 100644
--- a/src/plugins/cnat/cnat_client.c
+++ b/src/plugins/cnat/cnat_client.c
@@ -20,10 +20,9 @@
#include <cnat/cnat_translation.h>
cnat_client_t *cnat_client_pool;
-
cnat_client_db_t cnat_client_db;
-
dpo_type_t cnat_client_dpo;
+fib_source_t cnat_fib_source;
static_always_inline u8
cnat_client_is_clone (cnat_client_t * cc)
@@ -34,10 +33,42 @@ cnat_client_is_clone (cnat_client_t * cc)
static void
cnat_client_db_remove (cnat_client_t * cc)
{
+ clib_bihash_kv_16_8_t bkey;
+ if (ip_addr_version (&cc->cc_ip) == AF_IP4)
+ {
+ bkey.key[0] = ip_addr_v4 (&cc->cc_ip).as_u32;
+ bkey.key[1] = 0;
+ }
+ else
+ {
+ bkey.key[0] = ip_addr_v6 (&cc->cc_ip).as_u64[0];
+ bkey.key[1] = ip_addr_v6 (&cc->cc_ip).as_u64[1];
+ }
+
+ clib_bihash_add_del_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, 0 /* del */);
+}
+
+static void
+cnat_client_db_add (cnat_client_t *cc)
+{
+ index_t cci;
+
+ cci = cc - cnat_client_pool;
+
+ clib_bihash_kv_16_8_t bkey;
+ bkey.value = cci;
if (ip_addr_version (&cc->cc_ip) == AF_IP4)
- hash_unset (cnat_client_db.crd_cip4, ip_addr_v4 (&cc->cc_ip).as_u32);
+ {
+ bkey.key[0] = ip_addr_v4 (&cc->cc_ip).as_u32;
+ bkey.key[1] = 0;
+ }
else
- hash_unset_mem_free (&cnat_client_db.crd_cip6, &ip_addr_v6 (&cc->cc_ip));
+ {
+ bkey.key[0] = ip_addr_v6 (&cc->cc_ip).as_u64[0];
+ bkey.key[1] = ip_addr_v6 (&cc->cc_ip).as_u64[1];
+ }
+
+ clib_bihash_add_del_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, 1 /* add */);
}
static void
@@ -118,21 +149,6 @@ cnat_client_translation_deleted (index_t cci)
cnat_client_destroy (cc);
}
-static void
-cnat_client_db_add (cnat_client_t * cc)
-{
- index_t cci;
-
- cci = cc - cnat_client_pool;
-
- if (ip_addr_version (&cc->cc_ip) == AF_IP4)
- hash_set (cnat_client_db.crd_cip4, ip_addr_v4 (&cc->cc_ip).as_u32, cci);
- else
- hash_set_mem_alloc (&cnat_client_db.crd_cip6,
- &ip_addr_v6 (&cc->cc_ip), cci);
-}
-
-
index_t
cnat_client_add (const ip_address_t * ip, u8 flags)
{
@@ -228,12 +244,6 @@ int
cnat_client_purge (void)
{
int rv = 0, rrv = 0;
- if ((rv = hash_elts (cnat_client_db.crd_cip6)))
- clib_warning ("len(crd_cip6) isnt 0 but %d", rv);
- rrv |= rv;
- if ((rv = hash_elts (cnat_client_db.crd_cip4)))
- clib_warning ("len(crd_cip4) isnt 0 but %d", rv);
- rrv |= rv;
if ((rv = pool_elts (cnat_client_pool)))
clib_warning ("len(cnat_client_pool) isnt 0 but %d", rv);
rrv |= rv;
@@ -251,9 +261,9 @@ format_cnat_client (u8 * s, va_list * args)
cnat_client_t *cc = pool_elt_at_index (cnat_client_pool, cci);
- s = format (s, "[%d] cnat-client:[%U] tr:%d sess:%d", cci,
- format_ip_address, &cc->cc_ip,
- cc->tr_refcnt, cc->session_refcnt);
+ s = format (s, "[%d] cnat-client:[%U] tr:%d sess:%d locks:%u", cci,
+ format_ip_address, &cc->cc_ip, cc->tr_refcnt, cc->session_refcnt,
+ cc->cc_locks);
if (cc->flags & CNAT_FLAG_EXCLUSIVE)
s = format (s, " exclusive");
@@ -291,7 +301,6 @@ cnat_client_show (vlib_main_t * vm,
vlib_cli_output(vm, "%U", format_cnat_client, cci, 0);
vlib_cli_output (vm, "%d clients", pool_elts (cnat_client_pool));
- vlib_cli_output (vm, "%d timestamps", pool_elts (cnat_timestamps));
}
else
{
@@ -371,12 +380,15 @@ const static dpo_vft_t cnat_client_dpo_vft = {
static clib_error_t *
cnat_client_init (vlib_main_t * vm)
{
+ cnat_main_t *cm = &cnat_main;
cnat_client_dpo = dpo_register_new_type (&cnat_client_dpo_vft,
cnat_client_dpo_nodes);
- cnat_client_db.crd_cip6 = hash_create_mem (0,
- sizeof (ip6_address_t),
- sizeof (uword));
+ clib_bihash_init_16_8 (&cnat_client_db.cc_ip_id_hash, "CNat client DB",
+ cm->client_hash_buckets, cm->client_hash_memory);
+
+ cnat_fib_source = fib_source_allocate ("cnat", CNAT_FIB_SOURCE_PRIORITY,
+ FIB_SOURCE_BH_SIMPLE);
clib_spinlock_init (&cnat_client_db.throttle_lock);
cnat_client_db.throttle_mem =
diff --git a/src/plugins/cnat/cnat_client.h b/src/plugins/cnat/cnat_client.h
index d6e3631d868..4dc6b754b2f 100644
--- a/src/plugins/cnat/cnat_client.h
+++ b/src/plugins/cnat/cnat_client.h
@@ -17,6 +17,7 @@
#define __CNAT_CLIENT_H__
#include <cnat/cnat_types.h>
+#include <vppinfra/bihash_16_8.h>
/**
* A client is a representation of an IP address behind the NAT.
@@ -85,8 +86,6 @@ extern void cnat_client_free_by_ip (ip46_address_t * addr, u8 af);
extern cnat_client_t *cnat_client_pool;
extern dpo_type_t cnat_client_dpo;
-#define CC_INDEX_INVALID ((u32)(~0))
-
static_always_inline cnat_client_t *
cnat_client_get (index_t i)
{
@@ -132,8 +131,7 @@ extern void cnat_client_throttle_pool_process ();
*/
typedef struct cnat_client_db_t_
{
- uword *crd_cip4;
- uword *crd_cip6;
+ clib_bihash_16_8_t cc_ip_id_hash;
/* Pool of addresses that have been throttled
and need to be refcounted before calling
cnat_client_free_by_ip */
@@ -149,27 +147,15 @@ extern cnat_client_db_t cnat_client_db;
static_always_inline cnat_client_t *
cnat_client_ip4_find (const ip4_address_t * ip)
{
- uword *p;
-
- p = hash_get (cnat_client_db.crd_cip4, ip->as_u32);
-
- if (p)
- return (pool_elt_at_index (cnat_client_pool, p[0]));
-
- return (NULL);
-}
-
-static_always_inline u32
-cnat_client_ip4_find_index (const ip4_address_t * ip)
-{
- uword *p;
+ clib_bihash_kv_16_8_t bkey, bval;
- p = hash_get (cnat_client_db.crd_cip4, ip->as_u32);
+ bkey.key[0] = ip->as_u32;
+ bkey.key[1] = 0;
- if (p)
- return p[0];
+ if (clib_bihash_search_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, &bval))
+ return (NULL);
- return -1;
+ return (pool_elt_at_index (cnat_client_pool, bval.value));
}
/**
@@ -178,14 +164,15 @@ cnat_client_ip4_find_index (const ip4_address_t * ip)
static_always_inline cnat_client_t *
cnat_client_ip6_find (const ip6_address_t * ip)
{
- uword *p;
+ clib_bihash_kv_16_8_t bkey, bval;
- p = hash_get_mem (cnat_client_db.crd_cip6, ip);
+ bkey.key[0] = ip->as_u64[0];
+ bkey.key[1] = ip->as_u64[1];
- if (p)
- return (pool_elt_at_index (cnat_client_pool, p[0]));
+ if (clib_bihash_search_16_8 (&cnat_client_db.cc_ip_id_hash, &bkey, &bval))
+ return (NULL);
- return (NULL);
+ return (pool_elt_at_index (cnat_client_pool, bval.value));
}
/**
diff --git a/src/plugins/cnat/cnat_inline.h b/src/plugins/cnat/cnat_inline.h
index 5a55ecbf3c0..2986b3497a9 100644
--- a/src/plugins/cnat/cnat_inline.h
+++ b/src/plugins/cnat/cnat_inline.h
@@ -19,72 +19,122 @@
#include <cnat/cnat_types.h>
+always_inline int
+cnat_ts_is_free_index (u32 index)
+{
+ u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS);
+ index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS);
+ return pool_is_free_index (cnat_timestamps.ts_pools[pidx], index);
+}
+
+always_inline cnat_timestamp_t *
+cnat_timestamp_get (u32 index)
+{
+ /* 6 top bits for choosing pool */
+ u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS);
+ index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS);
+ return pool_elt_at_index (cnat_timestamps.ts_pools[pidx], index);
+}
+
+always_inline cnat_timestamp_t *
+cnat_timestamp_get_if_valid (u32 index)
+{
+ /* 6 top bits for choosing pool */
+ u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS);
+ index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS);
+ if (pidx >= cnat_timestamps.next_empty_pool_idx)
+ return (NULL);
+ if (pool_is_free_index (cnat_timestamps.ts_pools[pidx], index))
+ return (NULL);
+ return pool_elt_at_index (cnat_timestamps.ts_pools[pidx], index);
+}
+
+always_inline index_t
+cnat_timestamp_alloc ()
+{
+ cnat_timestamp_t *ts;
+ u32 index, pool_sz;
+ uword pidx;
+
+ clib_spinlock_lock (&cnat_timestamps.ts_lock);
+ pidx = clib_bitmap_first_set (cnat_timestamps.ts_free);
+ pool_sz = 1 << (CNAT_TS_BASE_SIZE + pidx);
+ ASSERT (pidx <= cnat_timestamps.next_empty_pool_idx);
+ if (pidx == cnat_timestamps.next_empty_pool_idx)
+ pool_init_fixed (
+ cnat_timestamps.ts_pools[cnat_timestamps.next_empty_pool_idx++],
+ pool_sz);
+ pool_get (cnat_timestamps.ts_pools[pidx], ts);
+ if (pool_elts (cnat_timestamps.ts_pools[pidx]) == pool_sz)
+ clib_bitmap_set (cnat_timestamps.ts_free, pidx, 0);
+ clib_spinlock_unlock (&cnat_timestamps.ts_lock);
+
+ index = (u32) pidx << (32 - CNAT_TS_MPOOL_BITS);
+ return index | (ts - cnat_timestamps.ts_pools[pidx]);
+}
+
+always_inline void
+cnat_timestamp_destroy (u32 index)
+{
+ u32 pidx = index >> (32 - CNAT_TS_MPOOL_BITS);
+ index = index & (0xffffffff >> CNAT_TS_MPOOL_BITS);
+ clib_spinlock_lock (&cnat_timestamps.ts_lock);
+ pool_put_index (cnat_timestamps.ts_pools[pidx], index);
+ clib_bitmap_set (cnat_timestamps.ts_free, pidx, 1);
+ clib_spinlock_unlock (&cnat_timestamps.ts_lock);
+}
+
always_inline u32
cnat_timestamp_new (f64 t)
{
- u32 index;
- cnat_timestamp_t *ts;
- clib_rwlock_writer_lock (&cnat_main.ts_lock);
- pool_get (cnat_timestamps, ts);
+ index_t index = cnat_timestamp_alloc ();
+ cnat_timestamp_t *ts = cnat_timestamp_get (index);
ts->last_seen = t;
ts->lifetime = cnat_main.session_max_age;
ts->refcnt = CNAT_TIMESTAMP_INIT_REFCNT;
- index = ts - cnat_timestamps;
- clib_rwlock_writer_unlock (&cnat_main.ts_lock);
return index;
}
always_inline void
cnat_timestamp_inc_refcnt (u32 index)
{
- clib_rwlock_reader_lock (&cnat_main.ts_lock);
- cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index);
- ts->refcnt++;
- clib_rwlock_reader_unlock (&cnat_main.ts_lock);
+ cnat_timestamp_t *ts = cnat_timestamp_get (index);
+ clib_atomic_add_fetch (&ts->refcnt, 1);
}
always_inline void
cnat_timestamp_update (u32 index, f64 t)
{
- clib_rwlock_reader_lock (&cnat_main.ts_lock);
- cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index);
+ cnat_timestamp_t *ts = cnat_timestamp_get (index);
ts->last_seen = t;
- clib_rwlock_reader_unlock (&cnat_main.ts_lock);
}
always_inline void
cnat_timestamp_set_lifetime (u32 index, u16 lifetime)
{
- clib_rwlock_reader_lock (&cnat_main.ts_lock);
- cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index);
+ cnat_timestamp_t *ts = cnat_timestamp_get (index);
ts->lifetime = lifetime;
- clib_rwlock_reader_unlock (&cnat_main.ts_lock);
}
always_inline f64
cnat_timestamp_exp (u32 index)
{
f64 t;
- if (INDEX_INVALID == index)
+ cnat_timestamp_t *ts = cnat_timestamp_get_if_valid (index);
+ if (NULL == ts)
return -1;
- clib_rwlock_reader_lock (&cnat_main.ts_lock);
- cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index);
t = ts->last_seen + (f64) ts->lifetime;
- clib_rwlock_reader_unlock (&cnat_main.ts_lock);
return t;
}
always_inline void
cnat_timestamp_free (u32 index)
{
- if (INDEX_INVALID == index)
+ cnat_timestamp_t *ts = cnat_timestamp_get_if_valid (index);
+ if (NULL == ts)
return;
- clib_rwlock_writer_lock (&cnat_main.ts_lock);
- cnat_timestamp_t *ts = pool_elt_at_index (cnat_timestamps, index);
- ts->refcnt--;
- if (0 == ts->refcnt)
- pool_put (cnat_timestamps, ts);
- clib_rwlock_writer_unlock (&cnat_main.ts_lock);
+ if (0 == clib_atomic_sub_fetch (&ts->refcnt, 1))
+ cnat_timestamp_destroy (index);
}
/*
diff --git a/src/plugins/cnat/cnat_maglev.c b/src/plugins/cnat/cnat_maglev.c
new file mode 100644
index 00000000000..2cdb868b3d7
--- /dev/null
+++ b/src/plugins/cnat/cnat_maglev.c
@@ -0,0 +1,379 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2022 Cisco Systems, Inc.
+ */
+
+#include <cnat/cnat_maglev.h>
+
+static int
+cnat_maglev_perm_compare (void *_a, void *_b)
+{
+ return *(u64 *) _b - *(u64 *) _a;
+}
+
+/**
+ * Maglev algorithm implementation. This takes permutation as input,
+ * with the values of offset & skip for the backends.
+ * It fills buckets matching the permutations, provided buckets is
+ * already of length at least M
+ */
+static void
+cnat_maglev_shuffle (cnat_maglev_perm_t *permutation, u32 *buckets)
+{
+ u32 N, M, i, done = 0;
+ u32 *next = 0;
+
+ N = vec_len (permutation);
+ if (N == 0)
+ return;
+
+ M = vec_len (buckets);
+ if (M == 0)
+ return;
+ vec_set (buckets, -1);
+
+ vec_validate (next, N - 1);
+ vec_zero (next);
+
+ while (1)
+ {
+ for (i = 0; i < N; i++)
+ {
+ u32 c = (permutation[i].offset + next[i] * permutation[i].skip) % M;
+ while (buckets[c] != (u32) -1)
+ {
+ next[i]++;
+ c = (permutation[i].offset + next[i] * permutation[i].skip) % M;
+ }
+
+ buckets[c] = permutation[i].index;
+ next[i]++;
+ done++;
+
+ if (done == M)
+ {
+ vec_free (next);
+ return;
+ }
+ }
+ }
+}
+
+void
+cnat_translation_init_maglev (cnat_translation_t *ct)
+{
+ cnat_maglev_perm_t *permutations = NULL;
+ cnat_main_t *cm = &cnat_main;
+ cnat_ep_trk_t *trk;
+ u32 backend_index = 0;
+
+ if (vec_len (ct->ct_active_paths) == 0)
+ return;
+
+ vec_foreach (trk, ct->ct_active_paths)
+ {
+ cnat_maglev_perm_t permutation;
+ u32 h1, h2;
+
+ if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip))
+ {
+ u32 a, b, c;
+ a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32;
+ b = (u64) trk->ct_ep[VLIB_TX].ce_port;
+ c = 0;
+ hash_v3_mix32 (a, b, c);
+ hash_v3_finalize32 (a, b, c);
+ h1 = c;
+ h2 = b;
+ }
+ else
+ {
+ u64 a, b, c;
+ a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0];
+ b = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1];
+ c = (u64) trk->ct_ep[VLIB_TX].ce_port;
+ hash_mix64 (a, b, c);
+ h1 = c;
+ h2 = b;
+ }
+
+ permutation.offset = h1 % cm->maglev_len;
+ permutation.skip = h2 % (cm->maglev_len - 1) + 1;
+ permutation.index = backend_index++;
+
+ if (trk->ct_flags & CNAT_TRK_FLAG_TEST_DISABLED)
+ continue;
+
+ vec_add1 (permutations, permutation);
+ }
+
+ vec_sort_with_function (permutations, cnat_maglev_perm_compare);
+
+ vec_validate (ct->lb_maglev, cm->maglev_len - 1);
+
+ cnat_maglev_shuffle (permutations, ct->lb_maglev);
+
+ vec_free (permutations);
+}
+
+static int
+cnat_u32_vec_contains (u32 *v, u32 e)
+{
+ int i;
+
+ vec_foreach_index (i, v)
+ if (v[i] == e)
+ return 1;
+
+ return 0;
+}
+
+static void
+cnat_maglev_print_changes (vlib_main_t *vm, u32 *changed_bk_indices,
+ u32 *old_maglev_lb, u32 *new_maglev_lb)
+{
+ u32 good_flow_buckets = 0, reset_flow_buckets = 0, stable_to_reset = 0;
+ u32 reset_to_stable = 0, switched_stable = 0;
+ if (vec_len (new_maglev_lb) == 0)
+ return;
+ for (u32 i = 0; i < vec_len (new_maglev_lb); i++)
+ {
+ u8 is_new_changed =
+ cnat_u32_vec_contains (changed_bk_indices, new_maglev_lb[i]);
+ u8 is_old_changed =
+ cnat_u32_vec_contains (changed_bk_indices, old_maglev_lb[i]);
+ if (new_maglev_lb[i] == old_maglev_lb[i])
+ {
+ if (is_new_changed)
+ reset_flow_buckets++;
+ else
+ good_flow_buckets++;
+ }
+ else
+ {
+ if (is_new_changed)
+ stable_to_reset++;
+ else if (is_old_changed)
+ reset_to_stable++;
+ else
+ switched_stable++;
+ }
+ }
+ vlib_cli_output (vm,
+ "good B->B:%d | lost A->A':%d A->B:%d ~%0.2f%% | bad "
+ "B->A':%d B->C:%d ~%0.2f%%",
+ good_flow_buckets, reset_flow_buckets, reset_to_stable,
+ (f64) (reset_flow_buckets + reset_to_stable) /
+ vec_len (new_maglev_lb) * 100.0,
+ stable_to_reset, switched_stable,
+ (f64) (stable_to_reset + switched_stable) /
+ vec_len (new_maglev_lb) * 100.0);
+}
+
+static u8 *
+format_cnat_maglev_buckets (u8 *s, va_list *args)
+{
+ u32 *buckets = va_arg (*args, u32 *);
+ u32 backend_idx = va_arg (*args, u32);
+ u32 count = va_arg (*args, u32);
+
+ for (u32 ii = 0; ii < vec_len (buckets); ii++)
+ if (buckets[ii] == backend_idx)
+ {
+ s = format (s, "%d,", ii);
+ if (--count == 0)
+ return (s);
+ }
+ return (s);
+}
+
+static clib_error_t *
+cnat_translation_test_init_maglev (vlib_main_t *vm, unformat_input_t *input,
+ vlib_cli_command_t *cmd)
+{
+ cnat_translation_t *trs = 0, *ct;
+ u64 num_backends = 0, n_tests = 0;
+ cnat_main_t *cm = &cnat_main;
+ cnat_ep_trk_t *trk;
+ u32 rnd;
+ u32 n_changes = 0, n_remove = 0, verbose = 0;
+
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (input, "tests %d", &n_tests))
+ ;
+ else if (unformat (input, "backends %d", &num_backends))
+ ;
+ else if (unformat (input, "len %d", &cm->maglev_len))
+ ;
+ else if (unformat (input, "change %d", &n_changes))
+ ;
+ else if (unformat (input, "rm %d", &n_remove))
+ ;
+ else if (unformat (input, "verbose %d", &verbose))
+ ;
+ else
+ return (clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, input));
+ }
+
+ if (num_backends == 0 || n_tests == 0)
+ return (clib_error_return (0, "No backends / tests to run"));
+ ;
+
+ vlib_cli_output (vm, "generating random backends...");
+ rnd = random_default_seed ();
+
+ vec_validate (trs, n_tests - 1);
+ vec_foreach (ct, trs)
+ {
+ vec_validate (ct->ct_active_paths, num_backends - 1);
+ vec_foreach (trk, ct->ct_active_paths)
+ {
+ trk->ct_flags = 0;
+ ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip) = AF_IP4;
+ ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32 = random_u32 (&rnd);
+ trk->ct_ep[VLIB_TX].ce_port = random_u32 (&rnd);
+ }
+ }
+
+ vlib_cli_output (vm, "testing...");
+ f64 start_time = vlib_time_now (vm);
+ vec_foreach (ct, trs)
+ cnat_translation_init_maglev (ct);
+ f64 d = vlib_time_now (vm) - start_time;
+
+ vlib_cli_output (vm, "Test took : %U", format_duration, d);
+ vlib_cli_output (vm, "Per pool : %U", format_duration, d / n_tests);
+
+ /* sanity checking of the output */
+ u32 *backend_freqs = 0;
+ vec_validate (backend_freqs, num_backends - 1);
+ vec_foreach (ct, trs)
+ {
+ if (vec_len (ct->lb_maglev) != cm->maglev_len)
+ vlib_cli_output (vm, "Unexpected bucket length %d",
+ vec_len (ct->lb_maglev));
+
+ vec_zero (backend_freqs);
+ for (u32 i = 0; i < vec_len (ct->lb_maglev); i++)
+ {
+ if (ct->lb_maglev[i] >= num_backends)
+ clib_warning ("out of bound backend");
+ backend_freqs[ct->lb_maglev[i]]++;
+ }
+ u32 fmin = ~0, fmax = 0;
+ for (u32 i = 0; i < num_backends; i++)
+ {
+ if (backend_freqs[i] > fmax)
+ fmax = backend_freqs[i];
+ if (backend_freqs[i] < fmin)
+ fmin = backend_freqs[i];
+ }
+ f64 fdiff = (fmax - fmin);
+ if (fdiff / vec_len (ct->lb_maglev) - 1 > 0.02)
+ vlib_cli_output (vm, "More than 2%% frequency diff (min %d max %d)",
+ fmin, fmax);
+ }
+ vec_free (backend_freqs);
+
+ int i = 0;
+ if (verbose)
+ vec_foreach (ct, trs)
+ {
+ vlib_cli_output (vm, "Translation %d", i++);
+ for (u32 i = 0; i < verbose; i++)
+ {
+ u32 j = random_u32 (&rnd) % vec_len (ct->ct_active_paths);
+ trk = &ct->ct_active_paths[j];
+ vlib_cli_output (
+ vm, "[%03d] %U:%d buckets:%U", j, format_ip_address,
+ &trk->ct_ep[VLIB_TX].ce_ip, trk->ct_ep[VLIB_TX].ce_port,
+ format_cnat_maglev_buckets, ct->lb_maglev, j, verbose);
+ }
+ }
+
+ if (n_remove != 0)
+ {
+ vlib_cli_output (
+ vm, "Removing %d entries (refered to as A), others (B,C) stay same",
+ n_remove);
+ vec_foreach (ct, trs)
+ {
+ u32 *old_maglev_lb = 0;
+ u32 *changed_bk_indices = 0;
+ if (vec_len (ct->lb_maglev) != cm->maglev_len)
+ vlib_cli_output (vm, "Unexpected bucket length %d",
+ vec_len (ct->lb_maglev));
+
+ vec_validate (changed_bk_indices, n_remove - 1);
+ for (u32 i = 0; i < n_remove; i++)
+ {
+ /* remove n_remove backends from the LB set */
+ changed_bk_indices[i] =
+ random_u32 (&rnd) % vec_len (ct->ct_active_paths);
+ trk = &ct->ct_active_paths[changed_bk_indices[i]];
+ trk->ct_flags |= CNAT_TRK_FLAG_TEST_DISABLED;
+ }
+
+ old_maglev_lb = vec_dup (ct->lb_maglev);
+ cnat_translation_init_maglev (ct);
+
+ cnat_maglev_print_changes (vm, changed_bk_indices, old_maglev_lb,
+ ct->lb_maglev);
+
+ vec_free (changed_bk_indices);
+ vec_free (old_maglev_lb);
+ }
+ }
+
+ /* Reshuffle and check changes */
+ if (n_changes != 0)
+ {
+ vlib_cli_output (
+ vm,
+ "Changing %d entries (refered to as A->A'), others (B,C) stay same",
+ n_changes);
+ vec_foreach (ct, trs)
+ {
+ if (vec_len (ct->lb_maglev) != cm->maglev_len)
+ vlib_cli_output (vm, "Unexpected bucket length %d",
+ vec_len (ct->lb_maglev));
+
+ u32 *old_maglev_lb = 0;
+ u32 *changed_bk_indices = 0;
+
+ vec_validate (changed_bk_indices, n_changes - 1);
+ for (u32 i = 0; i < n_changes; i++)
+ {
+ /* Change n_changes backends in the LB set */
+ changed_bk_indices[i] =
+ random_u32 (&rnd) % vec_len (ct->ct_active_paths);
+ trk = &ct->ct_active_paths[changed_bk_indices[i]];
+ ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32 =
+ random_u32 (&rnd);
+ trk->ct_ep[VLIB_TX].ce_port = random_u32 (&rnd) & 0xffff;
+ }
+ old_maglev_lb = vec_dup (ct->lb_maglev);
+
+ cnat_translation_init_maglev (ct);
+ cnat_maglev_print_changes (vm, changed_bk_indices, old_maglev_lb,
+ ct->lb_maglev);
+
+ vec_free (changed_bk_indices);
+ vec_free (old_maglev_lb);
+ }
+ }
+
+ vec_foreach (ct, trs)
+ vec_free (ct->ct_active_paths);
+ vec_free (trs);
+
+ return (NULL);
+}
+
+VLIB_CLI_COMMAND (cnat_translation_test_init_maglev_cmd, static) = {
+ .path = "test cnat maglev",
+ .short_help = "test cnat maglev tests [n_tests] backends [num_backends] len "
+ "[maglev_len]",
+ .function = cnat_translation_test_init_maglev,
+};
diff --git a/src/plugins/cnat/cnat_maglev.h b/src/plugins/cnat/cnat_maglev.h
new file mode 100644
index 00000000000..a71dd3ce796
--- /dev/null
+++ b/src/plugins/cnat/cnat_maglev.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright(c) 2022 Cisco Systems, Inc.
+ */
+
+#ifndef __CNAT_MAGLEV_H__
+#define __CNAT_MAGLEV_H__
+
+#include <cnat/cnat_types.h>
+#include <cnat/cnat_translation.h>
+
+typedef struct
+{
+ /* offset & skip used for sorting, should be first */
+ u32 offset;
+ u32 skip;
+ u32 index;
+} cnat_maglev_perm_t;
+
+extern void cnat_translation_init_maglev (cnat_translation_t *ct);
+
+#endif \ No newline at end of file
diff --git a/src/plugins/cnat/cnat_node.h b/src/plugins/cnat/cnat_node.h
index 246fdb8ba57..d81f6745bc4 100644
--- a/src/plugins/cnat/cnat_node.h
+++ b/src/plugins/cnat/cnat_node.h
@@ -19,6 +19,7 @@
#include <vlibmemory/api.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/load_balance_map.h>
+#include <vnet/ip/ip_psh_cksum.h>
#include <cnat/cnat_session.h>
#include <cnat/cnat_client.h>
@@ -169,86 +170,92 @@ cmp_ip6_address (const ip6_address_t * a1, const ip6_address_t * a2)
* Inline translation functions
*/
-static_always_inline u8
-has_ip6_address (ip6_address_t * a)
+static_always_inline u16
+ip4_pseudo_header_cksum2 (ip4_header_t *ip4, ip4_address_t address[VLIB_N_DIR])
{
- return ((0 != a->as_u64[0]) || (0 != a->as_u64[1]));
+ ip4_psh_t psh = { 0 };
+ psh.src = address[VLIB_RX];
+ psh.dst = address[VLIB_TX];
+ psh.proto = ip4->protocol;
+ psh.l4len = clib_host_to_net_u16 (clib_net_to_host_u16 (ip4->length) -
+ sizeof (ip4_header_t));
+ return ~(clib_ip_csum ((u8 *) &psh, sizeof (ip4_psh_t)));
}
static_always_inline void
-cnat_ip4_translate_l4 (ip4_header_t * ip4, udp_header_t * udp,
- ip_csum_t * sum,
+cnat_ip4_translate_l4 (ip4_header_t *ip4, udp_header_t *udp, ip_csum_t *sum,
ip4_address_t new_addr[VLIB_N_DIR],
- u16 new_port[VLIB_N_DIR])
+ u16 new_port[VLIB_N_DIR], u32 oflags)
{
u16 old_port[VLIB_N_DIR];
- ip4_address_t old_addr[VLIB_N_DIR];
+ old_port[VLIB_TX] = udp->dst_port;
+ old_port[VLIB_RX] = udp->src_port;
- /* Fastpath no checksum */
- if (PREDICT_TRUE (0 == *sum))
+ udp->dst_port = new_port[VLIB_TX];
+ udp->src_port = new_port[VLIB_RX];
+
+ if (oflags &
+ (VNET_BUFFER_OFFLOAD_F_TCP_CKSUM | VNET_BUFFER_OFFLOAD_F_UDP_CKSUM))
{
- udp->dst_port = new_port[VLIB_TX];
- udp->src_port = new_port[VLIB_RX];
+ *sum = ip4_pseudo_header_cksum2 (ip4, new_addr);
return;
}
- old_port[VLIB_TX] = udp->dst_port;
- old_port[VLIB_RX] = udp->src_port;
- old_addr[VLIB_TX] = ip4->dst_address;
- old_addr[VLIB_RX] = ip4->src_address;
+ *sum = ip_csum_update (*sum, ip4->dst_address.as_u32,
+ new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address);
+ *sum = ip_csum_update (*sum, ip4->src_address.as_u32,
+ new_addr[VLIB_RX].as_u32, ip4_header_t, src_address);
- if (new_addr[VLIB_TX].as_u32)
+ *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX],
+ udp_header_t, dst_port);
+ *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX],
+ udp_header_t, src_port);
+}
+
+static_always_inline void
+cnat_ip4_translate_sctp (ip4_header_t *ip4, sctp_header_t *sctp,
+ u16 new_port[VLIB_N_DIR])
+{
+ /* Fastpath no checksum */
+ if (PREDICT_TRUE (0 == sctp->checksum))
{
- *sum =
- ip_csum_update (*sum, old_addr[VLIB_TX].as_u32,
- new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address);
+ sctp->dst_port = new_port[VLIB_TX];
+ sctp->src_port = new_port[VLIB_RX];
+ return;
}
+
if (new_port[VLIB_TX])
- {
- udp->dst_port = new_port[VLIB_TX];
- *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
- }
- if (new_addr[VLIB_RX].as_u32)
- {
- *sum =
- ip_csum_update (*sum, old_addr[VLIB_RX].as_u32,
- new_addr[VLIB_RX].as_u32, ip4_header_t, src_address);
- }
+ sctp->dst_port = new_port[VLIB_TX];
if (new_port[VLIB_RX])
- {
- udp->src_port = new_port[VLIB_RX];
- *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
- }
+ sctp->src_port = new_port[VLIB_RX];
+
+ sctp->checksum = 0;
+ sctp->checksum = clib_host_to_little_u32 (~clib_crc32c_with_init (
+ (u8 *) sctp, ntohs (ip4->length) - sizeof (ip4_header_t),
+ ~0 /* init value */));
}
static_always_inline void
-cnat_ip4_translate_l3 (ip4_header_t * ip4, ip4_address_t new_addr[VLIB_N_DIR])
+cnat_ip4_translate_l3 (ip4_header_t *ip4, ip4_address_t new_addr[VLIB_N_DIR],
+ u32 oflags)
{
ip4_address_t old_addr[VLIB_N_DIR];
ip_csum_t sum;
-
old_addr[VLIB_TX] = ip4->dst_address;
old_addr[VLIB_RX] = ip4->src_address;
+ ip4->dst_address = new_addr[VLIB_TX];
+ ip4->src_address = new_addr[VLIB_RX];
+
+ // We always compute the IP checksum, even when oflags has
+ // VNET_BUFFER_OFFLOAD_F_IP_CKSUM set, as this is relatively inexpensive
+ // and avoids issues with downstream drivers that do not behave
+ // properly.
sum = ip4->checksum;
- if (new_addr[VLIB_TX].as_u32)
- {
- ip4->dst_address = new_addr[VLIB_TX];
- sum =
- ip_csum_update (sum, old_addr[VLIB_TX].as_u32,
+ sum = ip_csum_update (sum, old_addr[VLIB_TX].as_u32,
new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address);
- }
- if (new_addr[VLIB_RX].as_u32)
- {
- ip4->src_address = new_addr[VLIB_RX];
- sum =
- ip_csum_update (sum, old_addr[VLIB_RX].as_u32,
+ sum = ip_csum_update (sum, old_addr[VLIB_RX].as_u32,
new_addr[VLIB_RX].as_u32, ip4_header_t, src_address);
- }
ip4->checksum = ip_csum_fold (sum);
}
@@ -257,48 +264,40 @@ cnat_tcp_update_session_lifetime (tcp_header_t * tcp, u32 index)
{
cnat_main_t *cm = &cnat_main;
if (PREDICT_FALSE (tcp_fin (tcp)))
- {
- cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT);
- }
+ cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT);
if (PREDICT_FALSE (tcp_rst (tcp)))
- {
- cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT);
- }
+ cnat_timestamp_set_lifetime (index, CNAT_DEFAULT_TCP_RST_TIMEOUT);
if (PREDICT_FALSE (tcp_syn (tcp) && tcp_ack (tcp)))
- {
- cnat_timestamp_set_lifetime (index, cm->tcp_max_age);
- }
+ cnat_timestamp_set_lifetime (index, cm->tcp_max_age);
}
static_always_inline void
-cnat_translation_icmp4_echo (ip4_header_t * ip4, icmp46_header_t * icmp,
+cnat_translation_icmp4_echo (ip4_header_t *ip4, icmp46_header_t *icmp,
ip4_address_t new_addr[VLIB_N_DIR],
- u16 new_port[VLIB_N_DIR])
+ u16 new_port[VLIB_N_DIR], u32 oflags)
{
ip_csum_t sum;
u16 old_port;
cnat_echo_header_t *echo = (cnat_echo_header_t *) (icmp + 1);
- cnat_ip4_translate_l3 (ip4, new_addr);
+ cnat_ip4_translate_l3 (ip4, new_addr, oflags);
old_port = echo->identifier;
echo->identifier = new_port[VLIB_RX];
sum = icmp->checksum;
- sum = ip_csum_update (sum, old_port, new_port[VLIB_RX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
+ sum =
+ ip_csum_update (sum, old_port, new_port[VLIB_RX], udp_header_t, src_port);
icmp->checksum = ip_csum_fold (sum);
}
static_always_inline void
-cnat_translation_icmp4_error (ip4_header_t * outer_ip4,
- icmp46_header_t * icmp,
+cnat_translation_icmp4_error (ip4_header_t *outer_ip4, icmp46_header_t *icmp,
ip4_address_t outer_new_addr[VLIB_N_DIR],
- u16 outer_new_port[VLIB_N_DIR],
- u8 snat_outer_ip)
+ u16 outer_new_port[VLIB_N_DIR], u8 snat_outer_ip,
+ u32 oflags)
{
ip4_address_t new_addr[VLIB_N_DIR];
ip4_address_t old_addr[VLIB_N_DIR];
@@ -327,18 +326,20 @@ cnat_translation_icmp4_error (ip4_header_t * outer_ip4,
/* translate outer ip. */
if (!snat_outer_ip)
outer_new_addr[VLIB_RX] = outer_ip4->src_address;
- cnat_ip4_translate_l3 (outer_ip4, outer_new_addr);
+ cnat_ip4_translate_l3 (outer_ip4, outer_new_addr, oflags);
if (ip4->protocol == IP_PROTOCOL_TCP)
{
inner_l4_old_sum = inner_l4_sum = tcp->checksum;
- cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port);
+ cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port,
+ 0 /* flags */);
tcp->checksum = ip_csum_fold (inner_l4_sum);
}
else if (ip4->protocol == IP_PROTOCOL_UDP)
{
inner_l4_old_sum = inner_l4_sum = udp->checksum;
- cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port);
+ cnat_ip4_translate_l4 (ip4, udp, &inner_l4_sum, new_addr, new_port,
+ 0 /* flags */);
udp->checksum = ip_csum_fold (inner_l4_sum);
}
else
@@ -351,37 +352,30 @@ cnat_translation_icmp4_error (ip4_header_t * outer_ip4,
/* UDP/TCP Ports changed */
if (old_port[VLIB_TX] && new_port[VLIB_TX])
sum = ip_csum_update (sum, old_port[VLIB_TX], new_port[VLIB_TX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
+ udp_header_t, dst_port);
if (old_port[VLIB_RX] && new_port[VLIB_RX])
sum = ip_csum_update (sum, old_port[VLIB_RX], new_port[VLIB_RX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
-
+ udp_header_t, src_port);
- cnat_ip4_translate_l3 (ip4, new_addr);
+ cnat_ip4_translate_l3 (ip4, new_addr, 0 /* oflags */);
ip_csum_t new_ip_sum = ip4->checksum;
/* IP checksum changed */
sum = ip_csum_update (sum, old_ip_sum, new_ip_sum, ip4_header_t, checksum);
/* IP src/dst addr changed */
- if (new_addr[VLIB_TX].as_u32)
- sum =
- ip_csum_update (sum, old_addr[VLIB_TX].as_u32, new_addr[VLIB_TX].as_u32,
- ip4_header_t, dst_address);
+ sum = ip_csum_update (sum, old_addr[VLIB_TX].as_u32,
+ new_addr[VLIB_TX].as_u32, ip4_header_t, dst_address);
- if (new_addr[VLIB_RX].as_u32)
- sum =
- ip_csum_update (sum, old_addr[VLIB_RX].as_u32, new_addr[VLIB_RX].as_u32,
- ip4_header_t, src_address);
+ sum = ip_csum_update (sum, old_addr[VLIB_RX].as_u32,
+ new_addr[VLIB_RX].as_u32, ip4_header_t, src_address);
icmp->checksum = ip_csum_fold (sum);
}
static_always_inline void
-cnat_translation_ip4 (const cnat_session_t * session,
- ip4_header_t * ip4, udp_header_t * udp)
+cnat_translation_ip4 (const cnat_session_t *session, ip4_header_t *ip4,
+ udp_header_t *udp, u32 oflags)
{
tcp_header_t *tcp = (tcp_header_t *) udp;
ip4_address_t new_addr[VLIB_N_DIR];
@@ -395,17 +389,23 @@ cnat_translation_ip4 (const cnat_session_t * session,
if (ip4->protocol == IP_PROTOCOL_TCP)
{
ip_csum_t sum = tcp->checksum;
- cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port);
+ cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port, oflags);
tcp->checksum = ip_csum_fold (sum);
- cnat_ip4_translate_l3 (ip4, new_addr);
+ cnat_ip4_translate_l3 (ip4, new_addr, oflags);
cnat_tcp_update_session_lifetime (tcp, session->value.cs_ts_index);
}
else if (ip4->protocol == IP_PROTOCOL_UDP)
{
ip_csum_t sum = udp->checksum;
- cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port);
+ cnat_ip4_translate_l4 (ip4, udp, &sum, new_addr, new_port, oflags);
udp->checksum = ip_csum_fold (sum);
- cnat_ip4_translate_l3 (ip4, new_addr);
+ cnat_ip4_translate_l3 (ip4, new_addr, oflags);
+ }
+ else if (ip4->protocol == IP_PROTOCOL_SCTP)
+ {
+ sctp_header_t *sctp = (sctp_header_t *) udp;
+ cnat_ip4_translate_sctp (ip4, sctp, new_port);
+ cnat_ip4_translate_l3 (ip4, new_addr, oflags);
}
else if (ip4->protocol == IP_PROTOCOL_ICMP)
{
@@ -417,74 +417,65 @@ cnat_translation_ip4 (const cnat_session_t * session,
(ip4->src_address.as_u32 ==
session->key.cs_ip[VLIB_RX].ip4.as_u32);
cnat_translation_icmp4_error (ip4, icmp, new_addr, new_port,
- snat_outer_ip);
+ snat_outer_ip, oflags);
}
else if (icmp_type_is_echo (icmp->type))
- cnat_translation_icmp4_echo (ip4, icmp, new_addr, new_port);
+ cnat_translation_icmp4_echo (ip4, icmp, new_addr, new_port, oflags);
}
}
static_always_inline void
cnat_ip6_translate_l3 (ip6_header_t * ip6, ip6_address_t new_addr[VLIB_N_DIR])
{
- if (has_ip6_address (&new_addr[VLIB_TX]))
- ip6_address_copy (&ip6->dst_address, &new_addr[VLIB_TX]);
- if (has_ip6_address (&new_addr[VLIB_RX]))
- ip6_address_copy (&ip6->src_address, &new_addr[VLIB_RX]);
+ ip6_address_copy (&ip6->dst_address, &new_addr[VLIB_TX]);
+ ip6_address_copy (&ip6->src_address, &new_addr[VLIB_RX]);
+}
+
+static_always_inline u16
+ip6_pseudo_header_cksum2 (ip6_header_t *ip6, ip6_address_t address[VLIB_N_DIR])
+{
+ ip6_psh_t psh = { 0 };
+ psh.src = address[VLIB_RX];
+ psh.dst = address[VLIB_TX];
+ psh.l4len = ip6->payload_length;
+ psh.proto = clib_host_to_net_u32 ((u32) ip6->protocol);
+ return ~(clib_ip_csum ((u8 *) &psh, sizeof (ip6_psh_t)));
}
static_always_inline void
-cnat_ip6_translate_l4 (ip6_header_t * ip6, udp_header_t * udp,
- ip_csum_t * sum,
+cnat_ip6_translate_l4 (ip6_header_t *ip6, udp_header_t *udp, ip_csum_t *sum,
ip6_address_t new_addr[VLIB_N_DIR],
- u16 new_port[VLIB_N_DIR])
+ u16 new_port[VLIB_N_DIR], u32 oflags)
{
u16 old_port[VLIB_N_DIR];
- ip6_address_t old_addr[VLIB_N_DIR];
+ old_port[VLIB_TX] = udp->dst_port;
+ old_port[VLIB_RX] = udp->src_port;
- /* Fastpath no checksum */
- if (PREDICT_TRUE (0 == *sum))
+ udp->dst_port = new_port[VLIB_TX];
+ udp->src_port = new_port[VLIB_RX];
+
+ if (oflags &
+ (VNET_BUFFER_OFFLOAD_F_TCP_CKSUM | VNET_BUFFER_OFFLOAD_F_UDP_CKSUM))
{
- udp->dst_port = new_port[VLIB_TX];
- udp->src_port = new_port[VLIB_RX];
+ *sum = ip6_pseudo_header_cksum2 (ip6, new_addr);
return;
}
- old_port[VLIB_TX] = udp->dst_port;
- old_port[VLIB_RX] = udp->src_port;
- ip6_address_copy (&old_addr[VLIB_TX], &ip6->dst_address);
- ip6_address_copy (&old_addr[VLIB_RX], &ip6->src_address);
+ *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[0]);
+ *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[1]);
+ *sum = ip_csum_sub_even (*sum, ip6->dst_address.as_u64[0]);
+ *sum = ip_csum_sub_even (*sum, ip6->dst_address.as_u64[1]);
- if (has_ip6_address (&new_addr[VLIB_TX]))
- {
- *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[0]);
- *sum = ip_csum_add_even (*sum, new_addr[VLIB_TX].as_u64[1]);
- *sum = ip_csum_sub_even (*sum, old_addr[VLIB_TX].as_u64[0]);
- *sum = ip_csum_sub_even (*sum, old_addr[VLIB_TX].as_u64[1]);
- }
+ *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[0]);
+ *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[1]);
+ *sum = ip_csum_sub_even (*sum, ip6->src_address.as_u64[0]);
+ *sum = ip_csum_sub_even (*sum, ip6->src_address.as_u64[1]);
- if (new_port[VLIB_TX])
- {
- udp->dst_port = new_port[VLIB_TX];
- *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
- }
- if (has_ip6_address (&new_addr[VLIB_RX]))
- {
- *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[0]);
- *sum = ip_csum_add_even (*sum, new_addr[VLIB_RX].as_u64[1]);
- *sum = ip_csum_sub_even (*sum, old_addr[VLIB_RX].as_u64[0]);
- *sum = ip_csum_sub_even (*sum, old_addr[VLIB_RX].as_u64[1]);
- }
+ *sum = ip_csum_update (*sum, old_port[VLIB_TX], new_port[VLIB_TX],
+ udp_header_t, dst_port);
- if (new_port[VLIB_RX])
- {
- udp->src_port = new_port[VLIB_RX];
- *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
- }
+ *sum = ip_csum_update (*sum, old_port[VLIB_RX], new_port[VLIB_RX],
+ udp_header_t, src_port);
}
static_always_inline void
@@ -503,26 +494,20 @@ cnat_translation_icmp6_echo (ip6_header_t * ip6, icmp46_header_t * icmp,
sum = icmp->checksum;
cnat_ip6_translate_l3 (ip6, new_addr);
- if (has_ip6_address (&new_addr[VLIB_TX]))
- {
- sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]);
- sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]);
- }
- if (has_ip6_address (&new_addr[VLIB_RX]))
- {
- sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]);
- sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]);
- }
+ sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]);
+ sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]);
+
+ sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]);
+ sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]);
echo->identifier = new_port[VLIB_RX];
- sum = ip_csum_update (sum, old_port, new_port[VLIB_RX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
+ sum =
+ ip_csum_update (sum, old_port, new_port[VLIB_RX], udp_header_t, src_port);
icmp->checksum = ip_csum_fold (sum);
}
@@ -566,79 +551,64 @@ cnat_translation_icmp6_error (ip6_header_t * outer_ip6,
if (!snat_outer_ip)
ip6_address_copy (&outer_new_addr[VLIB_RX], &outer_ip6->src_address);
cnat_ip6_translate_l3 (outer_ip6, outer_new_addr);
- if (has_ip6_address (&outer_new_addr[VLIB_TX]))
- {
- sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[0]);
- sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[1]);
- sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[0]);
- sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[1]);
- }
- if (has_ip6_address (&outer_new_addr[VLIB_RX]))
- {
- sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[0]);
- sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[1]);
- sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[0]);
- sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[1]);
- }
+ sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[0]);
+ sum = ip_csum_add_even (sum, outer_new_addr[VLIB_TX].as_u64[1]);
+ sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[0]);
+ sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_TX].as_u64[1]);
+
+ sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[0]);
+ sum = ip_csum_add_even (sum, outer_new_addr[VLIB_RX].as_u64[1]);
+ sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[0]);
+ sum = ip_csum_sub_even (sum, outer_old_addr[VLIB_RX].as_u64[1]);
/* Translate inner TCP / UDP */
if (ip6->protocol == IP_PROTOCOL_TCP)
{
inner_l4_old_sum = inner_l4_sum = tcp->checksum;
- cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port);
+ cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port,
+ 0 /* oflags */);
tcp->checksum = ip_csum_fold (inner_l4_sum);
}
else if (ip6->protocol == IP_PROTOCOL_UDP)
{
inner_l4_old_sum = inner_l4_sum = udp->checksum;
- cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port);
+ cnat_ip6_translate_l4 (ip6, udp, &inner_l4_sum, new_addr, new_port,
+ 0 /* oflags */);
udp->checksum = ip_csum_fold (inner_l4_sum);
}
else
return;
/* UDP/TCP checksum changed */
- sum = ip_csum_update (sum, inner_l4_old_sum, inner_l4_sum,
- ip4_header_t /* cheat */ ,
+ sum = ip_csum_update (sum, inner_l4_old_sum, inner_l4_sum, ip4_header_t,
checksum);
/* UDP/TCP Ports changed */
- if (old_port[VLIB_TX] && new_port[VLIB_TX])
- sum = ip_csum_update (sum, old_port[VLIB_TX], new_port[VLIB_TX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
-
- if (old_port[VLIB_RX] && new_port[VLIB_RX])
- sum = ip_csum_update (sum, old_port[VLIB_RX], new_port[VLIB_RX],
- ip4_header_t /* cheat */ ,
- length /* changed member */ );
+ sum = ip_csum_update (sum, old_port[VLIB_TX], new_port[VLIB_TX],
+ udp_header_t, dst_port);
+ sum = ip_csum_update (sum, old_port[VLIB_RX], new_port[VLIB_RX],
+ udp_header_t, src_port);
cnat_ip6_translate_l3 (ip6, new_addr);
/* IP src/dst addr changed */
- if (has_ip6_address (&new_addr[VLIB_TX]))
- {
- sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]);
- sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]);
- }
+ sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[0]);
+ sum = ip_csum_add_even (sum, new_addr[VLIB_TX].as_u64[1]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[0]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_TX].as_u64[1]);
- if (has_ip6_address (&new_addr[VLIB_RX]))
- {
- sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]);
- sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]);
- sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]);
- }
+ sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[0]);
+ sum = ip_csum_add_even (sum, new_addr[VLIB_RX].as_u64[1]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[0]);
+ sum = ip_csum_sub_even (sum, old_addr[VLIB_RX].as_u64[1]);
icmp->checksum = ip_csum_fold (sum);
}
static_always_inline void
-cnat_translation_ip6 (const cnat_session_t * session,
- ip6_header_t * ip6, udp_header_t * udp)
+cnat_translation_ip6 (const cnat_session_t *session, ip6_header_t *ip6,
+ udp_header_t *udp, u32 oflags)
{
tcp_header_t *tcp = (tcp_header_t *) udp;
ip6_address_t new_addr[VLIB_N_DIR];
@@ -652,7 +622,7 @@ cnat_translation_ip6 (const cnat_session_t * session,
if (ip6->protocol == IP_PROTOCOL_TCP)
{
ip_csum_t sum = tcp->checksum;
- cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port);
+ cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port, oflags);
tcp->checksum = ip_csum_fold (sum);
cnat_ip6_translate_l3 (ip6, new_addr);
cnat_tcp_update_session_lifetime (tcp, session->value.cs_ts_index);
@@ -660,7 +630,7 @@ cnat_translation_ip6 (const cnat_session_t * session,
else if (ip6->protocol == IP_PROTOCOL_UDP)
{
ip_csum_t sum = udp->checksum;
- cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port);
+ cnat_ip6_translate_l4 (ip6, udp, &sum, new_addr, new_port, oflags);
udp->checksum = ip_csum_fold (sum);
cnat_ip6_translate_l3 (ip6, new_addr);
}
@@ -743,6 +713,18 @@ cnat_session_make_key (vlib_buffer_t *b, ip_address_family_t af,
session->key.cs_port[VLIB_RX] = udp->src_port;
session->key.cs_port[VLIB_TX] = udp->dst_port;
}
+ else if (ip4->protocol == IP_PROTOCOL_SCTP)
+ {
+ sctp_header_t *sctp;
+ sctp = (sctp_header_t *) (ip4 + 1);
+ ip46_address_set_ip4 (&session->key.cs_ip[VLIB_TX],
+ &ip4->dst_address);
+ ip46_address_set_ip4 (&session->key.cs_ip[VLIB_RX],
+ &ip4->src_address);
+ session->key.cs_proto = ip4->protocol;
+ session->key.cs_port[VLIB_RX] = sctp->src_port;
+ session->key.cs_port[VLIB_TX] = sctp->dst_port;
+ }
else
goto error;
}
@@ -837,20 +819,74 @@ cnat_load_balance (const cnat_translation_t *ct, ip_address_family_t af,
* rsession_location is the location the (return) session will be
* matched at
*/
+
+static_always_inline void
+cnat_session_create (cnat_session_t *session, cnat_node_ctx_t *ctx)
+{
+ cnat_bihash_kv_t *bkey = (cnat_bihash_kv_t *) session;
+
+ session->value.cs_ts_index = cnat_timestamp_new (ctx->now);
+ cnat_bihash_add_del (&cnat_session_db, bkey, 1);
+}
+
static_always_inline void
-cnat_session_create (cnat_session_t *session, cnat_node_ctx_t *ctx,
- cnat_session_location_t rsession_location,
- u8 rsession_flags)
+cnat_rsession_create (cnat_session_t *session, cnat_node_ctx_t *ctx,
+ cnat_session_location_t rsession_location,
+ cnat_session_flag_t rsession_flags)
{
cnat_client_t *cc;
cnat_bihash_kv_t rkey;
cnat_session_t *rsession = (cnat_session_t *) & rkey;
cnat_bihash_kv_t *bkey = (cnat_bihash_kv_t *) session;
- cnat_bihash_kv_t rvalue;
- int rv;
+ int rv, n_retries = 0;
+ static u32 sport_seed = 0;
- session->value.cs_ts_index = cnat_timestamp_new (ctx->now);
- cnat_bihash_add_del (&cnat_session_db, bkey, 1);
+ cnat_timestamp_inc_refcnt (session->value.cs_ts_index);
+
+ /* First create the return session */
+ ip46_address_copy (&rsession->key.cs_ip[VLIB_RX],
+ &session->value.cs_ip[VLIB_TX]);
+ ip46_address_copy (&rsession->key.cs_ip[VLIB_TX],
+ &session->value.cs_ip[VLIB_RX]);
+ rsession->key.cs_proto = session->key.cs_proto;
+ rsession->key.cs_loc = rsession_location;
+ rsession->key.__cs_pad = 0;
+ rsession->key.cs_af = ctx->af;
+ rsession->key.cs_port[VLIB_RX] = session->value.cs_port[VLIB_TX];
+ rsession->key.cs_port[VLIB_TX] = session->value.cs_port[VLIB_RX];
+
+ ip46_address_copy (&rsession->value.cs_ip[VLIB_RX],
+ &session->key.cs_ip[VLIB_TX]);
+ ip46_address_copy (&rsession->value.cs_ip[VLIB_TX],
+ &session->key.cs_ip[VLIB_RX]);
+ rsession->value.cs_ts_index = session->value.cs_ts_index;
+ rsession->value.cs_lbi = INDEX_INVALID;
+ rsession->value.flags = rsession_flags | CNAT_SESSION_IS_RETURN;
+ rsession->value.cs_port[VLIB_TX] = session->key.cs_port[VLIB_RX];
+ rsession->value.cs_port[VLIB_RX] = session->key.cs_port[VLIB_TX];
+
+retry_add_ression:
+ rv = cnat_bihash_add_del (&cnat_session_db, &rkey,
+ 2 /* add but don't overwrite */);
+ if (rv)
+ {
+ if (!(rsession_flags & CNAT_SESSION_RETRY_SNAT))
+ return;
+
+ /* return session add failed, pick a new random src port */
+ rsession->value.cs_port[VLIB_TX] = session->key.cs_port[VLIB_RX] =
+ random_u32 (&sport_seed);
+ if (n_retries++ < 100)
+ goto retry_add_ression;
+ else
+ {
+ clib_warning ("Could not find a free port after 100 tries");
+ /* translate this packet, but don't create state */
+ return;
+ }
+ }
+
+ cnat_bihash_add_del (&cnat_session_db, bkey, 1 /* add */);
if (!(rsession_flags & CNAT_SESSION_FLAG_NO_CLIENT))
{
@@ -894,39 +930,6 @@ cnat_session_create (cnat_session_t *session, cnat_node_ctx_t *ctx,
}
}
- /* create the reverse flow key */
- ip46_address_copy (&rsession->key.cs_ip[VLIB_RX],
- &session->value.cs_ip[VLIB_TX]);
- ip46_address_copy (&rsession->key.cs_ip[VLIB_TX],
- &session->value.cs_ip[VLIB_RX]);
- rsession->key.cs_proto = session->key.cs_proto;
- rsession->key.cs_loc = rsession_location;
- rsession->key.__cs_pad = 0;
- rsession->key.cs_af = ctx->af;
- rsession->key.cs_port[VLIB_RX] = session->value.cs_port[VLIB_TX];
- rsession->key.cs_port[VLIB_TX] = session->value.cs_port[VLIB_RX];
-
- /* First search for existing reverse session */
- rv = cnat_bihash_search_i2 (&cnat_session_db, &rkey, &rvalue);
- if (!rv)
- {
- /* Reverse session already exists
- cleanup before creating for refcnts */
- cnat_session_t *found_rsession = (cnat_session_t *) & rvalue;
- cnat_session_free (found_rsession);
- }
- /* add the reverse flow */
- ip46_address_copy (&rsession->value.cs_ip[VLIB_RX],
- &session->key.cs_ip[VLIB_TX]);
- ip46_address_copy (&rsession->value.cs_ip[VLIB_TX],
- &session->key.cs_ip[VLIB_RX]);
- rsession->value.cs_ts_index = session->value.cs_ts_index;
- rsession->value.cs_lbi = INDEX_INVALID;
- rsession->value.flags = rsession_flags | CNAT_SESSION_IS_RETURN;
- rsession->value.cs_port[VLIB_TX] = session->key.cs_port[VLIB_RX];
- rsession->value.cs_port[VLIB_RX] = session->key.cs_port[VLIB_TX];
-
- cnat_bihash_add_del (&cnat_session_db, &rkey, 1);
}
always_inline uword
diff --git a/src/plugins/cnat/cnat_node_feature.c b/src/plugins/cnat/cnat_node_feature.c
index aced4cd0a15..9b2c0c2fe06 100644
--- a/src/plugins/cnat/cnat_node_feature.c
+++ b/src/plugins/cnat/cnat_node_feature.c
@@ -143,7 +143,10 @@ cnat_input_feature_fn (vlib_main_t *vm, vlib_node_runtime_t *node,
/* refcnt session in current client */
cnat_client_cnt_session (cc);
- cnat_session_create (session, ctx, CNAT_LOCATION_OUTPUT, rsession_flags);
+ cnat_session_create (session, ctx);
+ if (!(ct->flags & CNAT_TR_FLAG_NO_RETURN_SESSION))
+ cnat_rsession_create (session, ctx, CNAT_LOCATION_OUTPUT,
+ rsession_flags);
trace_flags |= CNAT_TRACE_SESSION_CREATED;
}
@@ -156,9 +159,9 @@ cnat_input_feature_fn (vlib_main_t *vm, vlib_node_runtime_t *node,
}
if (AF_IP4 == ctx->af)
- cnat_translation_ip4 (session, ip4, udp0);
+ cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags);
else
- cnat_translation_ip6 (session, ip6, udp0);
+ cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags);
if (NULL != ct)
{
@@ -320,14 +323,17 @@ cnat_output_feature_fn (vlib_main_t *vm, vlib_node_runtime_t *node,
CNAT_SESSION_FLAG_NO_CLIENT | CNAT_SESSION_FLAG_ALLOC_PORT;
trace_flags |= CNAT_TRACE_SESSION_CREATED;
- cnat_session_create (session, ctx, CNAT_LOCATION_INPUT,
- CNAT_SESSION_FLAG_NO_CLIENT);
+
+ cnat_session_create (session, ctx);
+ cnat_rsession_create (session, ctx, CNAT_LOCATION_INPUT,
+ CNAT_SESSION_FLAG_NO_CLIENT |
+ CNAT_SESSION_RETRY_SNAT);
}
if (AF_IP4 == ctx->af)
- cnat_translation_ip4 (session, ip4, udp0);
+ cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags);
else
- cnat_translation_ip6 (session, ip6, udp0);
+ cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags);
trace:
if (PREDICT_FALSE (ctx->do_trace))
diff --git a/src/plugins/cnat/cnat_node_snat.c b/src/plugins/cnat/cnat_node_snat.c
index 9212d67ead6..57530eb397d 100644
--- a/src/plugins/cnat/cnat_node_snat.c
+++ b/src/plugins/cnat/cnat_node_snat.c
@@ -129,15 +129,15 @@ cnat_snat_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node,
CNAT_SESSION_FLAG_NO_CLIENT | CNAT_SESSION_FLAG_ALLOC_PORT;
trace_flags |= CNAT_TRACE_SESSION_CREATED;
- cnat_session_create (session, ctx, CNAT_LOCATION_FIB,
- CNAT_SESSION_FLAG_HAS_SNAT);
+ cnat_session_create (session, ctx);
+ cnat_rsession_create (session, ctx, CNAT_LOCATION_FIB,
+ CNAT_SESSION_FLAG_HAS_SNAT);
}
-
if (AF_IP4 == ctx->af)
- cnat_translation_ip4 (session, ip4, udp0);
+ cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags);
else
- cnat_translation_ip6 (session, ip6, udp0);
+ cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags);
trace:
if (PREDICT_FALSE (b->flags & VLIB_BUFFER_IS_TRACED))
diff --git a/src/plugins/cnat/cnat_node_vip.c b/src/plugins/cnat/cnat_node_vip.c
index f166bd4f194..d320746c5fa 100644
--- a/src/plugins/cnat/cnat_node_vip.c
+++ b/src/plugins/cnat/cnat_node_vip.c
@@ -168,7 +168,9 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
/* refcnt session in current client */
cnat_client_cnt_session (cc);
- cnat_session_create (session, ctx, CNAT_LOCATION_FIB, rsession_flags);
+ cnat_session_create (session, ctx);
+ if (!(ct->flags & CNAT_TR_FLAG_NO_RETURN_SESSION))
+ cnat_rsession_create (session, ctx, CNAT_LOCATION_FIB, rsession_flags);
trace_flags |= CNAT_TRACE_SESSION_CREATED;
next0 = ct->ct_lb.dpoi_next_node;
@@ -176,9 +178,9 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
}
if (AF_IP4 == ctx->af)
- cnat_translation_ip4 (session, ip4, udp0);
+ cnat_translation_ip4 (session, ip4, udp0, vnet_buffer (b)->oflags);
else
- cnat_translation_ip6 (session, ip6, udp0);
+ cnat_translation_ip6 (session, ip6, udp0, vnet_buffer (b)->oflags);
if (NULL != ct)
{
diff --git a/src/plugins/cnat/cnat_scanner.c b/src/plugins/cnat/cnat_scanner.c
index b3591f7e8b0..2f982711581 100644
--- a/src/plugins/cnat/cnat_scanner.c
+++ b/src/plugins/cnat/cnat_scanner.c
@@ -14,6 +14,7 @@
*/
#include <cnat/cnat_session.h>
+#include <vlibmemory/api.h>
#include <cnat/cnat_client.h>
static uword
diff --git a/src/plugins/cnat/cnat_session.c b/src/plugins/cnat/cnat_session.c
index 216d2575c37..0f1cd43f501 100644
--- a/src/plugins/cnat/cnat_session.c
+++ b/src/plugins/cnat/cnat_session.c
@@ -94,7 +94,8 @@ format_cnat_session (u8 * s, va_list * args)
cnat_session_t *sess = va_arg (*args, cnat_session_t *);
CLIB_UNUSED (int verbose) = va_arg (*args, int);
f64 ts = 0;
- if (!pool_is_free_index (cnat_timestamps, sess->value.cs_ts_index))
+
+ if (!cnat_ts_is_free_index (sess->value.cs_ts_index))
ts = cnat_timestamp_exp (sess->value.cs_ts_index);
s = format (
@@ -172,15 +173,43 @@ cnat_session_purge (void)
return (0);
}
+void
+cnat_reverse_session_free (cnat_session_t *session)
+{
+ cnat_bihash_kv_t bkey, bvalue;
+ cnat_session_t *rsession = (cnat_session_t *) &bkey;
+ int rv;
+
+ ip46_address_copy (&rsession->key.cs_ip[VLIB_RX],
+ &session->value.cs_ip[VLIB_TX]);
+ ip46_address_copy (&rsession->key.cs_ip[VLIB_TX],
+ &session->value.cs_ip[VLIB_RX]);
+ rsession->key.cs_proto = session->key.cs_proto;
+ rsession->key.cs_loc = session->key.cs_loc == CNAT_LOCATION_OUTPUT ?
+ CNAT_LOCATION_INPUT :
+ CNAT_LOCATION_OUTPUT;
+ rsession->key.__cs_pad = 0;
+ rsession->key.cs_af = session->key.cs_af;
+ rsession->key.cs_port[VLIB_RX] = session->value.cs_port[VLIB_TX];
+ rsession->key.cs_port[VLIB_TX] = session->value.cs_port[VLIB_RX];
+
+ rv = cnat_bihash_search_i2 (&cnat_session_db, &bkey, &bvalue);
+ if (!rv)
+ {
+ /* other session is in bihash */
+ cnat_session_t *rsession = (cnat_session_t *) &bvalue;
+ cnat_session_free (rsession);
+ }
+}
+
u64
cnat_session_scan (vlib_main_t * vm, f64 start_time, int i)
{
BVT (clib_bihash) * h = &cnat_session_db;
int j, k;
- /* Don't scan the l2 fib if it hasn't been instantiated yet */
if (alloc_arena (h) == 0)
- return 0.0;
+ return 0;
for ( /* caller saves starting point */ ; i < h->nbuckets; i++)
{
@@ -210,7 +239,7 @@ cnat_session_scan (vlib_main_t * vm, f64 start_time, int i)
{
for (k = 0; k < BIHASH_KVP_PER_PAGE; k++)
{
- if (v->kvp[k].key[0] == ~0ULL && v->kvp[k].value[0] == ~0ULL)
+ if (BV (clib_bihash_is_free) (&v->kvp[k]))
continue;
cnat_session_t *session = (cnat_session_t *) & v->kvp[k];
@@ -219,6 +248,9 @@ cnat_session_scan (vlib_main_t * vm, f64 start_time, int i)
cnat_timestamp_exp (session->value.cs_ts_index))
{
/* age it */
+ cnat_reverse_session_free (session);
+ /* this must come last, as deleting the session memsets it to
+ * 0xff */
cnat_session_free (session);
/*
@@ -248,6 +280,12 @@ cnat_session_init (vlib_main_t * vm)
cm->session_hash_memory);
BV (clib_bihash_set_kvp_format_fn) (&cnat_session_db, format_cnat_session);
+ cnat_timestamps.next_empty_pool_idx = 0;
+ clib_bitmap_alloc (cnat_timestamps.ts_free, 1 << CNAT_TS_MPOOL_BITS);
+ clib_bitmap_set_region (cnat_timestamps.ts_free, 0, 1,
+ 1 << CNAT_TS_MPOOL_BITS);
+ clib_spinlock_init (&cnat_timestamps.ts_lock);
+
return (NULL);
}
@@ -258,21 +296,38 @@ cnat_timestamp_show (vlib_main_t * vm,
unformat_input_t * input, vlib_cli_command_t * cmd)
{
cnat_timestamp_t *ts;
- clib_rwlock_reader_lock (&cnat_main.ts_lock);
- pool_foreach (ts, cnat_timestamps)
+ int ts_cnt = 0, cnt;
+ u8 verbose = 0;
+ while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
{
- vlib_cli_output (vm, "[%d] last_seen:%f lifetime:%u ref:%u",
- ts - cnat_timestamps, ts->last_seen, ts->lifetime,
- ts->refcnt);
+ if (unformat (input, "verbose"))
+ verbose = 1;
+ else
+ return (clib_error_return (0, "unknown input '%U'",
+ format_unformat_error, input));
+ }
+
+ for (int i = 0; i < cnat_timestamps.next_empty_pool_idx; i++)
+ {
+ cnt = pool_elts (cnat_timestamps.ts_pools[i]);
+ ts_cnt += cnt;
+ vlib_cli_output (vm, "-- Pool %d [%d/%d]", i, cnt,
+ pool_header (cnat_timestamps.ts_pools[i])->max_elts);
+ if (!verbose)
+ continue;
+ pool_foreach (ts, cnat_timestamps.ts_pools[i])
+ vlib_cli_output (vm, "[%d] last_seen:%f lifetime:%u ref:%u",
+ ts - cnat_timestamps.ts_pools[i], ts->last_seen,
+ ts->lifetime, ts->refcnt);
}
- clib_rwlock_reader_unlock (&cnat_main.ts_lock);
+ vlib_cli_output (vm, "Total timestamps %d", ts_cnt);
return (NULL);
}
VLIB_CLI_COMMAND (cnat_timestamp_show_cmd, static) = {
.path = "show cnat timestamp",
.function = cnat_timestamp_show,
- .short_help = "show cnat timestamp",
+ .short_help = "show cnat timestamp [verbose]",
.is_mp_safe = 1,
};
diff --git a/src/plugins/cnat/cnat_session.h b/src/plugins/cnat/cnat_session.h
index 072bb10f96f..a0a28c9a818 100644
--- a/src/plugins/cnat/cnat_session.h
+++ b/src/plugins/cnat/cnat_session.h
@@ -129,6 +129,11 @@ typedef enum cnat_session_flag_t_
/* Debug flag marking return sessions */
CNAT_SESSION_IS_RETURN = (1 << 4),
+
+ /** On conflicts when adding the return session, try to sNAT the
+ * forward session, and dNAT the return session with a random port */
+ CNAT_SESSION_RETRY_SNAT = (1 << 5),
+
} cnat_session_flag_t;
typedef enum cnat_session_location_t_
diff --git a/src/plugins/cnat/cnat_snat_policy.c b/src/plugins/cnat/cnat_snat_policy.c
index d59156f34c8..cd9bfef492a 100644
--- a/src/plugins/cnat/cnat_snat_policy.c
+++ b/src/plugins/cnat/cnat_snat_policy.c
@@ -29,6 +29,8 @@ unformat_cnat_snat_interface_map_type (unformat_input_t *input, va_list *args)
*a = CNAT_SNAT_IF_MAP_INCLUDE_V6;
else if (unformat (input, "k8s"))
*a = CNAT_SNAT_IF_MAP_INCLUDE_POD;
+ else if (unformat (input, "host"))
+ *a = CNAT_SNAT_IF_MAP_INCLUDE_HOST;
else
return 0;
return 1;
@@ -49,6 +51,9 @@ format_cnat_snat_interface_map_type (u8 *s, va_list *args)
case CNAT_SNAT_IF_MAP_INCLUDE_POD:
s = format (s, "k8s pod");
break;
+ case CNAT_SNAT_IF_MAP_INCLUDE_HOST:
+ s = format (s, "k8s host");
+ break;
default:
s = format (s, "(unknown)");
break;
@@ -108,7 +113,7 @@ cnat_snat_policy_add_del_if_command_fn (vlib_main_t *vm,
vnet_main_t *vnm = vnet_get_main ();
int is_add = 1;
u32 sw_if_index = ~0;
- u32 table;
+ u32 table = 0;
int rv;
while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
@@ -296,6 +301,14 @@ cnat_snat_policy_k8s (vlib_buffer_t *b, cnat_session_t *session)
u32 in_if = vnet_buffer (b)->sw_if_index[VLIB_RX];
u32 out_if = vnet_buffer (b)->sw_if_index[VLIB_TX];
+ /* we should never snat traffic that we punt to the host, pass traffic as it
+ * is for us */
+ if (clib_bitmap_get (cpm->interface_maps[CNAT_SNAT_IF_MAP_INCLUDE_HOST],
+ out_if))
+ {
+ return 0;
+ }
+
/* source nat for outgoing connections */
if (cnat_snat_policy_interface_enabled (in_if, af))
if (cnat_search_snat_prefix (dst_addr, af))
diff --git a/src/plugins/cnat/cnat_snat_policy.h b/src/plugins/cnat/cnat_snat_policy.h
index 987ae494e16..61c2382602f 100644
--- a/src/plugins/cnat/cnat_snat_policy.h
+++ b/src/plugins/cnat/cnat_snat_policy.h
@@ -45,6 +45,9 @@ typedef enum cnat_snat_interface_map_type_t_
CNAT_SNAT_IF_MAP_INCLUDE_V4 = AF_IP4,
CNAT_SNAT_IF_MAP_INCLUDE_V6 = AF_IP6,
CNAT_SNAT_IF_MAP_INCLUDE_POD,
+ /* CNAT_SNAT_IF_MAP_INCLUDE_HOST is used for interfaces used for punt,
+ replicating uplink */
+ CNAT_SNAT_IF_MAP_INCLUDE_HOST,
CNAT_N_SNAT_IF_MAP,
} cnat_snat_interface_map_type_t;
diff --git a/src/plugins/cnat/cnat_src_policy.c b/src/plugins/cnat/cnat_src_policy.c
index cac24b7742c..8f3f3375148 100644
--- a/src/plugins/cnat/cnat_src_policy.c
+++ b/src/plugins/cnat/cnat_src_policy.c
@@ -59,8 +59,8 @@ cnat_vip_default_source_policy (vlib_main_t * vm,
u16 sport;
sport = udp0->src_port;
/* Allocate a port only if asked and if we actually sNATed */
- if ((ct->flags & CNAT_TRANSLATION_FLAG_ALLOCATE_PORT)
- && (*rsession_flags & CNAT_SESSION_FLAG_HAS_SNAT))
+ if ((ct->flags & CNAT_TR_FLAG_ALLOCATE_PORT) &&
+ (*rsession_flags & CNAT_SESSION_FLAG_HAS_SNAT))
{
sport = 0; /* force allocation */
session->value.flags |= CNAT_SESSION_FLAG_ALLOC_PORT;
diff --git a/src/plugins/cnat/cnat_translation.c b/src/plugins/cnat/cnat_translation.c
index 049809a8684..513cedf0446 100644
--- a/src/plugins/cnat/cnat_translation.c
+++ b/src/plugins/cnat/cnat_translation.c
@@ -18,8 +18,10 @@
#include <vnet/fib/fib_entry_track.h>
#include <vnet/dpo/load_balance.h>
#include <vnet/dpo/drop_dpo.h>
+#include <vnet/dpo/dpo.h>
#include <cnat/cnat_translation.h>
+#include <cnat/cnat_maglev.h>
#include <cnat/cnat_session.h>
#include <cnat/cnat_client.h>
@@ -82,6 +84,7 @@ cnat_tracker_release (cnat_ep_trk_t * trk)
/* We only track fully resolved endpoints */
if (!(trk->ct_flags & CNAT_TRK_ACTIVE))
return;
+ dpo_reset (&trk->ct_dpo); // undo fib_entry_contribute_forwarding
fib_entry_untrack (trk->ct_fei, trk->ct_sibling);
}
@@ -200,110 +203,7 @@ cnat_remove_translation_from_db (index_t cci, cnat_endpoint_t * vip,
clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 0);
}
-typedef struct
-{
- cnat_ep_trk_t *trk;
- u32 index;
- u32 offset;
- u32 skip;
-} cnat_maglev_entry_t;
-static int
-cnat_maglev_entry_compare (void *_a, void *_b)
-{
- cnat_ep_trk_t *a = ((cnat_maglev_entry_t *) _a)->trk;
- cnat_ep_trk_t *b = ((cnat_maglev_entry_t *) _b)->trk;
- int rv = 0;
- if ((rv =
- ip_address_cmp (&a->ct_ep[VLIB_TX].ce_ip, &b->ct_ep[VLIB_TX].ce_ip)))
- return rv;
- if ((rv = a->ct_ep[VLIB_TX].ce_port - a->ct_ep[VLIB_TX].ce_port))
- return rv;
- if ((rv =
- ip_address_cmp (&a->ct_ep[VLIB_RX].ce_ip, &b->ct_ep[VLIB_RX].ce_ip)))
- return rv;
- if ((rv = a->ct_ep[VLIB_RX].ce_port - a->ct_ep[VLIB_RX].ce_port))
- return rv;
- return 0;
-}
-
-static void
-cnat_translation_init_maglev (cnat_translation_t *ct)
-{
- cnat_maglev_entry_t *backends = NULL, *bk;
- cnat_main_t *cm = &cnat_main;
- u32 done = 0;
- cnat_ep_trk_t *trk;
- int ep_idx = 0;
-
- vec_foreach (trk, ct->ct_active_paths)
- {
- cnat_maglev_entry_t bk;
- u32 h1, h2;
-
- if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip))
- {
- u32 a, b, c;
- a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32;
- b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 |
- (u64) trk->ct_ep[VLIB_RX].ce_port;
- c = ip_addr_v4 (&trk->ct_ep[VLIB_RX].ce_ip).data_u32;
- hash_v3_mix32 (a, b, c);
- hash_v3_finalize32 (a, b, c);
- h1 = c;
- h2 = b;
- }
- else
- {
- u64 a, b, c;
- a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0] ^
- ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1];
- b = (u64) trk->ct_ep[VLIB_TX].ce_port << 16 |
- (u64) trk->ct_ep[VLIB_RX].ce_port;
- c = ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[0] ^
- ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[1];
- hash_mix64 (a, b, c);
- h1 = c;
- h2 = b;
- }
-
- bk.offset = h1 % cm->maglev_len;
- bk.skip = h2 % (cm->maglev_len - 1) + 1;
- bk.index = ep_idx++;
- bk.trk = trk;
- vec_add1 (backends, bk);
- }
-
- if (0 == ep_idx)
- return;
-
- vec_sort_with_function (backends, cnat_maglev_entry_compare);
-
- /* Don't free if previous vector exists, just zero */
- vec_validate (ct->lb_maglev, cm->maglev_len);
- vec_set (ct->lb_maglev, -1);
-
- while (1)
- {
- vec_foreach (bk, backends)
- {
- u32 next = 0;
- u32 c = (bk->offset + next * bk->skip) % cm->maglev_len;
- while (ct->lb_maglev[c] != (u32) -1)
- {
- next++;
- c = (bk->offset + next * bk->skip) % cm->maglev_len;
- }
- ct->lb_maglev[c] = bk->index;
- done++;
- if (done == cm->maglev_len)
- goto finished;
- }
- }
-
-finished:
- vec_free (backends);
-}
static void
cnat_translation_stack (cnat_translation_t * ct)
@@ -323,8 +223,11 @@ cnat_translation_stack (cnat_translation_t * ct)
if (trk->ct_flags & CNAT_TRK_ACTIVE)
vec_add1 (ct->ct_active_paths, *trk);
+ flow_hash_config_t fhc = IP_FLOW_HASH_DEFAULT;
+ if (ct->fhc != 0)
+ fhc = ct->fhc;
lbi = load_balance_create (vec_len (ct->ct_active_paths),
- fib_proto_to_dpo (fproto), IP_FLOW_HASH_DEFAULT);
+ fib_proto_to_dpo (fproto), fhc);
ep_idx = 0;
vec_foreach (trk, ct->ct_active_paths)
@@ -335,7 +238,7 @@ cnat_translation_stack (cnat_translation_t * ct)
dpo_set (&ct->ct_lb, DPO_LOAD_BALANCE, dproto, lbi);
dpo_stack (cnat_client_dpo, dproto, &ct->ct_lb, &ct->ct_lb);
- ct->flags |= CNAT_TRANSLATION_STACKED;
+ ct->flags |= CNAT_TR_FLAG_STACKED;
}
int
@@ -365,8 +268,9 @@ cnat_translation_delete (u32 id)
u32
cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
cnat_endpoint_tuple_t *paths, u8 flags,
- cnat_lb_type_t lb_type)
+ cnat_lb_type_t lb_type, flow_hash_config_t fhc)
{
+ const dpo_id_t tmp = DPO_INVALID;
cnat_endpoint_tuple_t *path;
const cnat_client_t *cc;
cnat_translation_t *ct;
@@ -398,6 +302,7 @@ cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
ct->ct_cci = cci;
ct->index = ct - cnat_translation_pool;
ct->lb_type = lb_type;
+ ct->fhc = fhc;
cnat_add_translation_to_db (cci, vip, proto, ct->index);
cnat_client_translation_added (cci);
@@ -417,7 +322,7 @@ cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
}
vec_reset_length (ct->ct_paths);
- ct->flags &= ~CNAT_TRANSLATION_STACKED;
+ ct->flags &= ~CNAT_TR_FLAG_STACKED;
u64 path_idx = 0;
vec_foreach (path, paths)
@@ -438,6 +343,7 @@ cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
clib_memcpy (&trk->ct_ep[VLIB_RX], &path->src_ep,
sizeof (trk->ct_ep[VLIB_RX]));
trk->ct_flags = path->ep_flags;
+ trk->ct_dpo = tmp;
cnat_tracker_track (ct->index, trk);
}
@@ -486,6 +392,11 @@ format_cnat_translation (u8 * s, va_list * args)
format_ip_protocol, ct->ct_proto);
s = format (s, "lb:%U ", format_cnat_lb_type, ct->lb_type);
+ if ((ct->fhc == 0) || (ct->fhc == IP_FLOW_HASH_DEFAULT))
+ s = format (s, "fhc:0x%x(default)", IP_FLOW_HASH_DEFAULT);
+ else
+ s = format (s, "fhc:0x%x", ct->fhc);
+
vec_foreach (ck, ct->ct_paths)
s = format (s, "\n%U", format_cnat_ep_trk, ck, 2);
@@ -615,7 +526,7 @@ cnat_translation_back_walk_notify (fib_node_t * node,
/* If we have more than FIB_PATH_LIST_POPULAR paths
* we might get called during path tracking
* (cnat_tracker_track) */
- if (!(ct->flags & CNAT_TRANSLATION_STACKED))
+ if (!(ct->flags & CNAT_TR_FLAG_STACKED))
return (FIB_NODE_BACK_WALK_CONTINUE);
cnat_translation_stack (ct);
@@ -678,8 +589,9 @@ cnat_translation_cli_add_del (vlib_main_t * vm,
}
}
+ flow_hash_config_t fhc = 0;
if (INDEX_INVALID == del_index)
- cnat_translation_update (&vip, proto, paths, flags, lb_type);
+ cnat_translation_update (&vip, proto, paths, flags, lb_type, fhc);
else
cnat_translation_delete (del_index);
@@ -764,11 +676,11 @@ cnat_if_addr_add_del_backend_cb (addr_resolution_t * ar,
ep->ce_flags |= CNAT_EP_FLAG_RESOLVED;
}
- ct->flags &= ~CNAT_TRANSLATION_STACKED;
+ ct->flags &= ~CNAT_TR_FLAG_STACKED;
cnat_tracker_track (ar->cti, trk);
cnat_translation_stack (ct);
- ct->flags |= CNAT_TRANSLATION_STACKED;
+ ct->flags |= CNAT_TR_FLAG_STACKED;
}
static void
@@ -825,7 +737,7 @@ cnat_translation_init (vlib_main_t * vm)
ip6_main_t *i6m = &ip6_main;
cnat_main_t *cm = &cnat_main;
cnat_translation_fib_node_type =
- fib_node_register_new_type (&cnat_translation_vft);
+ fib_node_register_new_type ("cnat-translation", &cnat_translation_vft);
clib_bihash_init_8_8 (&cnat_translation_db, "CNat translation DB",
cm->translation_hash_buckets,
diff --git a/src/plugins/cnat/cnat_translation.h b/src/plugins/cnat/cnat_translation.h
index 97b0c908b42..9bb3455d9fe 100644
--- a/src/plugins/cnat/cnat_translation.h
+++ b/src/plugins/cnat/cnat_translation.h
@@ -60,12 +60,14 @@ typedef struct cnat_ep_trk_t_
typedef enum cnat_translation_flag_t_
{
/* Do allocate a source port */
- CNAT_TRANSLATION_FLAG_ALLOCATE_PORT = (1 << 0),
+ CNAT_TR_FLAG_ALLOCATE_PORT = (1 << 0),
/* Has this translation been stacked?
* Lets the FIB back-walk notification skip translations that are not
* stacked yet, which can happen during path tracking when there are
* more than FIB_PATH_LIST_POPULAR backends */
- CNAT_TRANSLATION_STACKED = (1 << 1),
-} cnat_translation_flag_t;
+ CNAT_TR_FLAG_STACKED = (1 << 1),
+ /* Do not create a return session */
+ CNAT_TR_FLAG_NO_RETURN_SESSION = (1 << 2),
+} __clib_packed cnat_translation_flag_t;
typedef enum
{
@@ -76,11 +78,11 @@ typedef enum
CNAT_ADDR_N_RESOLUTIONS,
} cnat_addr_resol_type_t;
-typedef enum __attribute__ ((__packed__))
+typedef enum
{
CNAT_LB_DEFAULT,
CNAT_LB_MAGLEV,
-} cnat_lb_type_t;
+} __clib_packed cnat_lb_type_t;
/**
* Entry used to account for a translation's backend
@@ -160,13 +162,18 @@ typedef struct cnat_translation_t_
/**
* Translation flags
*/
- u8 flags;
+ cnat_translation_flag_t flags;
/**
* Type of load balancing
*/
cnat_lb_type_t lb_type;
+ /**
+ * Type of flow hash config
+ */
+ flow_hash_config_t fhc;
+
union
{
u32 *lb_maglev;
@@ -189,7 +196,8 @@ extern u8 *format_cnat_translation (u8 * s, va_list * args);
extern u32 cnat_translation_update (cnat_endpoint_t *vip,
ip_protocol_t ip_proto,
cnat_endpoint_tuple_t *backends, u8 flags,
- cnat_lb_type_t lb_type);
+ cnat_lb_type_t lb_type,
+ flow_hash_config_t fhc);
/**
* Delete a translation
diff --git a/src/plugins/cnat/cnat_types.c b/src/plugins/cnat/cnat_types.c
index 9b164c6069d..084a03da968 100644
--- a/src/plugins/cnat/cnat_types.c
+++ b/src/plugins/cnat/cnat_types.c
@@ -16,8 +16,7 @@
#include <cnat/cnat_types.h>
cnat_main_t cnat_main;
-fib_source_t cnat_fib_source;
-cnat_timestamp_t *cnat_timestamps;
+cnat_timestamp_mpool_t cnat_timestamps;
char *cnat_error_strings[] = {
#define cnat_error(n,s) s,
@@ -152,19 +151,6 @@ format_cnat_endpoint (u8 * s, va_list * args)
return (s);
}
-static clib_error_t *
-cnat_types_init (vlib_main_t * vm)
-{
- cnat_fib_source = fib_source_allocate ("cnat",
- CNAT_FIB_SOURCE_PRIORITY,
- FIB_SOURCE_BH_SIMPLE);
-
-
- clib_rwlock_init (&cnat_main.ts_lock);
-
- return (NULL);
-}
-
void
cnat_enable_disable_scanner (cnat_scanner_cmd_t event_type)
{
@@ -191,6 +177,8 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input)
cm->session_hash_buckets = CNAT_DEFAULT_SESSION_BUCKETS;
cm->translation_hash_memory = CNAT_DEFAULT_TRANSLATION_MEMORY;
cm->translation_hash_buckets = CNAT_DEFAULT_TRANSLATION_BUCKETS;
+ cm->client_hash_memory = CNAT_DEFAULT_CLIENT_MEMORY;
+ cm->client_hash_buckets = CNAT_DEFAULT_CLIENT_BUCKETS;
cm->snat_hash_memory = CNAT_DEFAULT_SNAT_MEMORY;
cm->snat_hash_buckets = CNAT_DEFAULT_SNAT_BUCKETS;
cm->snat_if_map_length = CNAT_DEFAULT_SNAT_IF_MAP_LEN;
@@ -215,6 +203,12 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input)
else if (unformat (input, "translation-db-memory %U",
unformat_memory_size, &cm->translation_hash_memory))
;
+ else if (unformat (input, "client-db-buckets %u",
+ &cm->client_hash_buckets))
+ ;
+ else if (unformat (input, "client-db-memory %U", unformat_memory_size,
+ &cm->client_hash_memory))
+ ;
else if (unformat (input, "snat-db-buckets %u", &cm->snat_hash_buckets))
;
else if (unformat (input, "snat-if-map-len %u", &cm->snat_if_map_length))
@@ -250,7 +244,6 @@ cnat_get_main ()
}
VLIB_EARLY_CONFIG_FUNCTION (cnat_config, "cnat");
-VLIB_INIT_FUNCTION (cnat_types_init);
/*
* fd.io coding-style-patch-verification: ON
diff --git a/src/plugins/cnat/cnat_types.h b/src/plugins/cnat/cnat_types.h
index c3ec74c345f..d229d21adae 100644
--- a/src/plugins/cnat/cnat_types.h
+++ b/src/plugins/cnat/cnat_types.h
@@ -36,12 +36,14 @@
#define CNAT_DEFAULT_SESSION_BUCKETS 1024
#define CNAT_DEFAULT_TRANSLATION_BUCKETS 1024
+#define CNAT_DEFAULT_CLIENT_BUCKETS 1024
#define CNAT_DEFAULT_SNAT_BUCKETS 1024
#define CNAT_DEFAULT_SNAT_IF_MAP_LEN 4096
#define CNAT_DEFAULT_SESSION_MEMORY (1 << 20)
#define CNAT_DEFAULT_TRANSLATION_MEMORY (256 << 10)
-#define CNAT_DEFAULT_SNAT_MEMORY (64 << 20)
+#define CNAT_DEFAULT_CLIENT_MEMORY (256 << 10)
+#define CNAT_DEFAULT_SNAT_MEMORY (64 << 10)
/* Should be prime >~ 100 * numBackends */
#define CNAT_DEFAULT_MAGLEV_LEN 1009
@@ -50,11 +52,24 @@
* from fib_source.h */
#define CNAT_FIB_SOURCE_PRIORITY 0x02
-/* Initial refcnt for timestamps (2 : session & rsession) */
-#define CNAT_TIMESTAMP_INIT_REFCNT 2
+/* Initial refcnt of a session's timestamp (1 : the forward session).
+ * It is incremented when the reverse session is added
+ * in cnat_rsession_create */
+#define CNAT_TIMESTAMP_INIT_REFCNT 1
#define MIN_SRC_PORT ((u16) 0xC000)
+/* SCTP common header: ports, verification tag and checksum (12 bytes).
+ * NOTE(review): fields are presumably held in network byte order, as read
+ * from the packet - confirm against the users of this struct. */
+typedef struct
+{
+ /* Source and destination port. */
+ u16 src_port, dst_port;
+
+ /* Random value to distinguish connections. */
+ u32 verification_tag;
+
+ /* Checksum field of the common header.
+ * NOTE(review): the SCTP specification uses CRC32c here - confirm how
+ * cnat handles it after rewriting ports. */
+ u32 checksum;
+} sctp_header_t;
+
typedef enum cnat_trk_flag_t_
{
/* Endpoint is active (static or dhcp resolved) */
@@ -62,6 +77,8 @@ typedef enum cnat_trk_flag_t_
/* Don't translate this endpoint, but still
* forward. Used by maglev for DSR */
CNAT_TRK_FLAG_NO_NAT = (1 << 1),
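+ /* NOTE(review): undocumented flag; the name suggests a test-only way
+ * to mark an endpoint as disabled - confirm and describe */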
+ CNAT_TRK_FLAG_TEST_DISABLED = (1 << 7),
} cnat_trk_flag_t;
typedef enum
@@ -105,6 +122,12 @@ typedef struct cnat_main_
/* Number of buckets of the translation bihash */
u32 translation_hash_buckets;
+ /* Memory size of the client bihash */
+ uword client_hash_memory;
+
+ /* Number of buckets of the client bihash */
+ u32 client_hash_buckets;
+
/* Memory size of the source NAT prefix bihash */
uword snat_hash_memory;
@@ -125,9 +148,6 @@ typedef struct cnat_main_
/* delay in seconds between two scans of session/clients tables */
f64 scanner_timeout;
- /* Lock for the timestamp pool */
- clib_rwlock_t ts_lock;
-
/* Index of the scanner process node */
uword scanner_node_index;
@@ -152,6 +172,23 @@ typedef struct cnat_timestamp_t_
u16 refcnt;
} cnat_timestamp_t;
+/* Create the first pool with 1 << CNAT_TS_BASE_SIZE elts */
+#define CNAT_TS_BASE_SIZE (8)
+/* reserve the top CNAT_TS_MPOOL_BITS bits for finding the pool */
+#define CNAT_TS_MPOOL_BITS (6)
+
+/* Container for all session timestamps, split across an array of up to
+ * (1 << CNAT_TS_MPOOL_BITS) pools rather than a single pool.
+ * cnat_session_init() starts with no pool initialised
+ * (next_empty_pool_idx = 0) and every bit of ts_free set; the
+ * "show cnat timestamp" CLI walks pools [0, next_empty_pool_idx). */
+typedef struct cnat_timestamp_mpool_t_
+{
+ /* Increasing fixed size pools of timestamps */
+ cnat_timestamp_t *ts_pools[1 << CNAT_TS_MPOOL_BITS];
+ /* Bitmap of pools with free space */
+ uword *ts_free;
+ /* Index of next pool to init */
+ u8 next_empty_pool_idx;
+ /* ts creation lock */
+ clib_spinlock_t ts_lock;
+} cnat_timestamp_mpool_t;
+
typedef struct cnat_node_ctx_
{
f64 now;
@@ -165,8 +202,7 @@ extern u8 *format_cnat_endpoint (u8 * s, va_list * args);
extern uword unformat_cnat_ep_tuple (unformat_input_t * input,
va_list * args);
extern uword unformat_cnat_ep (unformat_input_t * input, va_list * args);
-extern cnat_timestamp_t *cnat_timestamps;
-extern fib_source_t cnat_fib_source;
+extern cnat_timestamp_mpool_t cnat_timestamps;
extern cnat_main_t cnat_main;
extern char *cnat_error_strings[];