aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/plugins/cnat/cnat.api18
-rw-r--r--src/plugins/cnat/cnat_api.c8
-rw-r--r--src/plugins/cnat/cnat_node.h29
-rw-r--r--src/plugins/cnat/cnat_node_vip.c19
-rw-r--r--src/plugins/cnat/cnat_session.h2
-rw-r--r--src/plugins/cnat/cnat_translation.c168
-rw-r--r--src/plugins/cnat/cnat_translation.h23
-rw-r--r--src/plugins/cnat/cnat_types.c3
-rw-r--r--src/plugins/cnat/cnat_types.h8
9 files changed, 253 insertions, 25 deletions
diff --git a/src/plugins/cnat/cnat.api b/src/plugins/cnat/cnat.api
index 685f9e17146..2b79e0d1b8b 100644
--- a/src/plugins/cnat/cnat.api
+++ b/src/plugins/cnat/cnat.api
@@ -19,7 +19,7 @@
used to control the ABF plugin
*/
-option version = "0.1.0";
+option version = "0.2.0";
import "vnet/ip/ip_types.api";
import "vnet/fib/fib_types.api";
import "vnet/interface_types.api";
@@ -29,6 +29,20 @@ enum cnat_translation_flags:u8
CNAT_TRANSLATION_ALLOC_PORT = 1,
};
+/* Flags that can be set on an endpoint tuple (backend path).
+ * NOTE(review): the API handler copies the tuple's flags byte verbatim
+ * into ep_flags, which is documented as holding cnat_trk_flag_t values;
+ * there bit 0 is CNAT_TRK_ACTIVE and NO_NAT is (1 << 1). Confirm that
+ * the value 1 used here is intended. */
+enum cnat_endpoint_tuple_flags:u8
+{
+ /* Do not translate traffic for this endpoint tuple,
+ * but still forward it */
+ CNAT_EPT_NO_NAT = 1,
+};
+
+
+/* Load-balancing algorithm requested for a translation.
+ * The API handlers cast these values directly to and from the plugin's
+ * internal cnat_lb_type_t, so the numbering of the two enums must stay
+ * in sync. */
+enum cnat_lb_type:u8
+{
+ CNAT_LB_TYPE_DEFAULT = 0,
+ CNAT_LB_TYPE_MAGLEV = 1,
+};
+
/* An enpoint is either
* An IP & a port
* An interface, an address familiy and a port */
@@ -44,6 +58,7 @@ typedef cnat_endpoint_tuple
{
vl_api_cnat_endpoint_t dst_ep;
vl_api_cnat_endpoint_t src_ep;
+ u8 flags;
};
typedef cnat_translation
@@ -53,6 +68,7 @@ typedef cnat_translation
vl_api_ip_proto_t ip_proto;
u8 is_real_ip;
u8 flags;
+ vl_api_cnat_lb_type_t lb_type;
u32 n_paths;
vl_api_cnat_endpoint_tuple_t paths[n_paths];
};
diff --git a/src/plugins/cnat/cnat_api.c b/src/plugins/cnat/cnat_api.c
index 1c6ef7b6cf4..99d9c729282 100644
--- a/src/plugins/cnat/cnat_api.c
+++ b/src/plugins/cnat/cnat_api.c
@@ -67,6 +67,7 @@ cnat_endpoint_tuple_decode (const vl_api_cnat_endpoint_tuple_t * in,
if (rv)
return rv;
rv = cnat_endpoint_decode (&in->dst_ep, &out->dst_ep);
+ out->ep_flags = in->flags;
return rv;
}
@@ -95,6 +96,7 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t
u8 flags;
int rv = 0;
u32 pi, n_paths;
+ cnat_lb_type_t lb_type;
rv = ip_proto_decode (mp->translation.ip_proto, &ip_proto);
@@ -119,7 +121,9 @@ vl_api_cnat_translation_update_t_handler (vl_api_cnat_translation_update_t
flags = mp->translation.flags;
if (!mp->translation.is_real_ip)
flags |= CNAT_FLAG_EXCLUSIVE;
- id = cnat_translation_update (&vip, ip_proto, paths, flags);
+
+ lb_type = (cnat_lb_type_t) mp->translation.lb_type;
+ id = cnat_translation_update (&vip, ip_proto, paths, flags, lb_type);
vec_free (paths);
@@ -172,12 +176,14 @@ cnat_translation_send_details (u32 cti, void *args)
mp->translation.id = clib_host_to_net_u32 (cti);
cnat_endpoint_encode (&ct->ct_vip, &mp->translation.vip);
mp->translation.ip_proto = ip_proto_encode (ct->ct_proto);
+ mp->translation.lb_type = (vl_api_cnat_lb_type_t) ct->lb_type;
path = mp->translation.paths;
vec_foreach (trk, ct->ct_paths)
{
cnat_endpoint_encode (&trk->ct_ep[VLIB_TX], &path->dst_ep);
cnat_endpoint_encode (&trk->ct_ep[VLIB_RX], &path->src_ep);
+ path->flags = trk->ct_flags;
path++;
}
diff --git a/src/plugins/cnat/cnat_node.h b/src/plugins/cnat/cnat_node.h
index 56a6c612e1b..157287b0cab 100644
--- a/src/plugins/cnat/cnat_node.h
+++ b/src/plugins/cnat/cnat_node.h
@@ -803,6 +803,35 @@ error:
return;
}
+/**
+ * Pick the backend for a new flow hitting translation ct.
+ *
+ * The flow hash is computed from the IPv4 or IPv6 header (selected by af)
+ * using the hash config of the translation's load-balance object. The hash
+ * is then mapped to a bucket either through the maglev lookup table
+ * (CNAT_LB_MAGLEV) or by a plain modulo on the number of buckets.
+ *
+ * @param ct          translation being load-balanced
+ * @param af          address family of the packet
+ * @param ip4         IPv4 header, hashed when af is AF_IP4
+ * @param ip6         IPv6 header, hashed otherwise
+ * @param dpoi_index  [out] dpoi_index of the selected forwarding bucket;
+ *                    not written when NULL is returned
+ *
+ * @return the tracker of the selected active path, or NULL when the
+ *         load-balance object has no buckets.
+ */
+static_always_inline cnat_ep_trk_t *
+cnat_load_balance (const cnat_translation_t *ct, ip_address_family_t af,
+                   ip4_header_t *ip4, ip6_header_t *ip6, u32 *dpoi_index)
+{
+  const load_balance_t *lb = load_balance_get (ct->ct_lb.dpoi_index);
+  u32 flow_hash, bucket;
+
+  /* Nothing to balance over: let the caller fall back */
+  if (PREDICT_FALSE (!lb->lb_n_buckets))
+    return (NULL);
+
+  /* session table miss */
+  if (AF_IP4 == af)
+    flow_hash = ip4_compute_flow_hash (ip4, lb->lb_hash_config);
+  else
+    flow_hash = ip6_compute_flow_hash (ip6, lb->lb_hash_config);
+
+  if (PREDICT_FALSE (ct->lb_type == CNAT_LB_MAGLEV))
+    bucket = ct->lb_maglev[flow_hash % cnat_main.maglev_len];
+  else
+    bucket = flow_hash % lb->lb_n_buckets;
+
+  *dpoi_index = load_balance_get_fwd_bucket (lb, bucket)->dpoi_index;
+
+  return &ct->ct_active_paths[bucket];
+}
+
/**
* Create NAT sessions
* rsession_location is the location the (return) session will be
diff --git a/src/plugins/cnat/cnat_node_vip.c b/src/plugins/cnat/cnat_node_vip.c
index f653aa1e430..bc7d30369ab 100644
--- a/src/plugins/cnat/cnat_node_vip.c
+++ b/src/plugins/cnat/cnat_node_vip.c
@@ -109,14 +109,12 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
}
/* New flow, create the sessions */
- const load_balance_t *lb0;
cnat_ep_trk_t *trk0;
- u32 hash_c0, bucket0;
u32 rsession_flags = 0;
- const dpo_id_t *dpo0;
+ u32 dpoi_index = -1;
- lb0 = load_balance_get (ct->ct_lb.dpoi_index);
- if (!lb0->lb_n_buckets)
+ trk0 = cnat_load_balance (ct, ctx->af, ip4, ip6, &dpoi_index);
+ if (PREDICT_FALSE (NULL == trk0))
{
/* Dont translate & Follow the fib programming */
vnet_buffer (b)->ip.adj_index[VLIB_TX] = cc->cc_parent.dpoi_index;
@@ -124,16 +122,7 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
goto trace;
}
- /* session table miss */
- hash_c0 = (AF_IP4 == ctx->af ?
- ip4_compute_flow_hash (ip4, lb0->lb_hash_config) :
- ip6_compute_flow_hash (ip6, lb0->lb_hash_config));
- bucket0 = hash_c0 % lb0->lb_n_buckets;
- dpo0 = load_balance_get_fwd_bucket (lb0, bucket0);
-
/* add the session */
- trk0 = &ct->ct_paths[bucket0];
-
ip46_address_copy (&session->value.cs_ip[VLIB_TX],
&trk0->ct_ep[VLIB_TX].ce_ip.ip);
if (ip_address_is_zero (&trk0->ct_ep[VLIB_RX].ce_ip))
@@ -158,7 +147,7 @@ cnat_vip_node_fn (vlib_main_t *vm, vlib_node_runtime_t *node, vlib_buffer_t *b,
clib_host_to_net_u16 (trk0->ct_ep[VLIB_RX].ce_port);
session->value.dpoi_next_node = ct->ct_lb.dpoi_next_node;
- session->value.cs_lbi = dpo0->dpoi_index;
+ session->value.cs_lbi = dpoi_index;
rv = cspm->vip_policy (vm, b, session, &rsession_flags, ct, ctx);
if (CNAT_SOURCE_ERROR_USE_DEFAULT == rv)
diff --git a/src/plugins/cnat/cnat_session.h b/src/plugins/cnat/cnat_session.h
index 540a2f29409..51764504bca 100644
--- a/src/plugins/cnat/cnat_session.h
+++ b/src/plugins/cnat/cnat_session.h
@@ -122,6 +122,8 @@ typedef enum cnat_session_flag_t_
* This session doesn't have a client, do not attempt to free it
*/
CNAT_SESSION_FLAG_NO_CLIENT = (1 << 2),
+
+ CNAT_SESSION_FLAG_NO_NAT = (1 << 3),
} cnat_session_flag_t;
typedef enum cnat_session_location_t_
diff --git a/src/plugins/cnat/cnat_translation.c b/src/plugins/cnat/cnat_translation.c
index 65c44d80b19..8b7cf2451b7 100644
--- a/src/plugins/cnat/cnat_translation.c
+++ b/src/plugins/cnat/cnat_translation.c
@@ -113,6 +113,32 @@ cnat_tracker_track (index_t cti, cnat_ep_trk_t * trk)
(pfx.fp_proto), &trk->ct_dpo);
}
+/**
+ * format() helper printing a cnat_lb_type_t as "default", "maglev",
+ * or "unknown" for any other value.
+ *
+ * Expects one variadic argument: the load-balancing type, read as an int.
+ */
+u8 *
+format_cnat_lb_type (u8 *s, va_list *args)
+{
+  cnat_lb_type_t type = va_arg (*args, int);
+
+  switch (type)
+    {
+    case CNAT_LB_DEFAULT:
+      return format (s, "default");
+    case CNAT_LB_MAGLEV:
+      return format (s, "maglev");
+    default:
+      return format (s, "unknown");
+    }
+}
+
+/**
+ * unformat() helper parsing a load-balancing type keyword.
+ *
+ * Expects one variadic argument: a cnat_lb_type_t pointer that receives
+ * CNAT_LB_DEFAULT for "default" or CNAT_LB_MAGLEV for "maglev".
+ *
+ * @return 1 when a keyword matched, 0 otherwise (output left untouched).
+ */
+uword
+unformat_cnat_lb_type (unformat_input_t *input, va_list *args)
+{
+  cnat_lb_type_t *result = va_arg (*args, cnat_lb_type_t *);
+
+  if (unformat (input, "default"))
+    {
+      *result = CNAT_LB_DEFAULT;
+      return 1;
+    }
+
+  if (unformat (input, "maglev"))
+    {
+      *result = CNAT_LB_MAGLEV;
+      return 1;
+    }
+
+  return 0;
+}
+
/**
* Add a translation to the bihash
*
@@ -177,6 +203,109 @@ cnat_remove_translation_from_db (index_t cci, cnat_endpoint_t * vip,
clib_bihash_add_del_8_8 (&cnat_translation_db, &bkey, 0);
}
+/* Scratch record describing one backend while the maglev lookup table
+ * of a translation is being built. */
+typedef struct
+{
+ /* Tracker of the backend (an element of ct_active_paths) */
+ cnat_ep_trk_t *trk;
+ /* Position of the backend in ct_active_paths; this is the value
+ * stored in the lookup table slots */
+ u32 index;
+ /* First slot this backend tries: h1 % maglev_len */
+ u32 offset;
+ /* Step between successive slots tried: h2 % (maglev_len - 1) + 1 */
+ u32 skip;
+} cnat_maglev_entry_t;
+
+/**
+ * Ordering function for maglev backend entries, used with
+ * vec_sort_with_function().
+ *
+ * Entries are ordered by their tracker's endpoints, comparing in turn:
+ * TX address, TX port, RX address, RX port.
+ *
+ * @param _a, _b  pointers to cnat_maglev_entry_t
+ * @return negative, zero or positive when a sorts before, equal to or
+ *         after b.
+ */
+static int
+cnat_maglev_entry_compare (void *_a, void *_b)
+{
+  cnat_ep_trk_t *a = ((cnat_maglev_entry_t *) _a)->trk;
+  cnat_ep_trk_t *b = ((cnat_maglev_entry_t *) _b)->trk;
+  int rv = 0;
+
+  if ((rv =
+         ip_address_cmp (&a->ct_ep[VLIB_TX].ce_ip, &b->ct_ep[VLIB_TX].ce_ip)))
+    return rv;
+  /* Ports must be compared across the two entries: backends sharing an
+   * address but differing in port need a stable relative order */
+  if ((rv = a->ct_ep[VLIB_TX].ce_port - b->ct_ep[VLIB_TX].ce_port))
+    return rv;
+  if ((rv =
+         ip_address_cmp (&a->ct_ep[VLIB_RX].ce_ip, &b->ct_ep[VLIB_RX].ce_ip)))
+    return rv;
+  if ((rv = a->ct_ep[VLIB_RX].ce_port - b->ct_ep[VLIB_RX].ce_port))
+    return rv;
+  return 0;
+}
+
+/**
+ * (Re)build the maglev lookup table of a translation.
+ *
+ * Each active path gets an (offset, skip) pair derived from a hash of its
+ * TX/RX addresses and ports. Backends are then sorted by endpoint and, in
+ * round-robin order, each claims the first free slot of its
+ * offset + k * skip (mod maglev_len) sequence until every slot is taken.
+ * Slots hold the backend's position in ct_active_paths.
+ *
+ * Does nothing when the translation has no active path.
+ *
+ * @param ct  translation whose lb_maglev table is (re)populated
+ */
+static void
+cnat_translation_init_maglev (cnat_translation_t *ct)
+{
+  cnat_maglev_entry_t *backends = NULL, *bk;
+  cnat_main_t *cm = &cnat_main;
+  u32 done = 0;
+  cnat_ep_trk_t *trk;
+  int ep_idx = 0;
+
+  vec_foreach (trk, ct->ct_active_paths)
+    {
+      cnat_maglev_entry_t entry;
+      u32 h1, h2;
+
+      if (AF_IP4 == ip_addr_version (&trk->ct_ep[VLIB_TX].ce_ip))
+        {
+          u32 a, b, c;
+          a = ip_addr_v4 (&trk->ct_ep[VLIB_TX].ce_ip).data_u32;
+          b = trk->ct_ep[VLIB_TX].ce_port << 16 | trk->ct_ep[VLIB_RX].ce_port;
+          c = ip_addr_v4 (&trk->ct_ep[VLIB_RX].ce_ip).data_u32;
+          hash_v3_mix32 (a, b, c);
+          hash_v3_finalize32 (a, b, c);
+          h1 = c;
+          h2 = b;
+        }
+      else
+        {
+          u64 a, b, c;
+          a = ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[0] ^
+              ip_addr_v6 (&trk->ct_ep[VLIB_TX].ce_ip).as_u64[1];
+          /* NOTE(review): assuming ce_port is a u16, it is promoted to int
+           * before the shift, so TX ports >= 0x8000 reach the sign bit and
+           * sign-extend into this u64. Left as is to keep the table layout
+           * unchanged; confirm whether this is intended. */
+          b = trk->ct_ep[VLIB_TX].ce_port << 16 | trk->ct_ep[VLIB_RX].ce_port;
+          c = ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[0] ^
+              ip_addr_v6 (&trk->ct_ep[VLIB_RX].ce_ip).as_u64[1];
+          hash_mix64 (a, b, c);
+          h1 = c;
+          h2 = b;
+        }
+
+      entry.offset = h1 % cm->maglev_len;
+      /* skip is in [1, maglev_len - 1] so that the probe sequence moves */
+      entry.skip = h2 % (cm->maglev_len - 1) + 1;
+      entry.index = ep_idx++;
+      entry.trk = trk;
+      vec_add1 (backends, entry);
+    }
+
+  if (0 == ep_idx)
+    return;
+
+  /* Sort so the fill order depends on the endpoints, not on the order
+   * in which paths were configured */
+  vec_sort_with_function (backends, cnat_maglev_entry_compare);
+
+  /* Reuse a previously allocated table if there is one. vec_validate()
+   * takes the highest valid index, hence maglev_len - 1 for exactly
+   * maglev_len slots. Every slot is then marked free (-1). */
+  vec_validate (ct->lb_maglev, cm->maglev_len - 1);
+  vec_set (ct->lb_maglev, -1);
+
+  while (1)
+    {
+      vec_foreach (bk, backends)
+        {
+          u32 next = 0;
+          u32 c = (bk->offset + next * bk->skip) % cm->maglev_len;
+          /* Walk this backend's probe sequence to its first free slot */
+          while (ct->lb_maglev[c] != (u32) -1)
+            {
+              next++;
+              c = (bk->offset + next * bk->skip) % cm->maglev_len;
+            }
+          ct->lb_maglev[c] = bk->index;
+          done++;
+          if (done == cm->maglev_len)
+            goto finished;
+        }
+    }
+
+finished:
+  vec_free (backends);
+}
+
static void
cnat_translation_stack (cnat_translation_t * ct)
{
@@ -202,6 +331,9 @@ cnat_translation_stack (cnat_translation_t * ct)
vec_foreach (trk, ct->ct_active_paths)
load_balance_set_bucket (lbi, ep_idx++, &trk->ct_dpo);
+ if (ep_idx > 0 && CNAT_LB_MAGLEV == ct->lb_type)
+ cnat_translation_init_maglev (ct);
+
dpo_set (&ct->ct_lb, DPO_LOAD_BALANCE, dproto, lbi);
dpo_stack (cnat_client_dpo, dproto, &ct->ct_lb, &ct->ct_lb);
ct->flags |= CNAT_TRANSLATION_STACKED;
@@ -232,9 +364,9 @@ cnat_translation_delete (u32 id)
}
u32
-cnat_translation_update (cnat_endpoint_t * vip,
- ip_protocol_t proto,
- cnat_endpoint_tuple_t * paths, u8 flags)
+cnat_translation_update (cnat_endpoint_t *vip, ip_protocol_t proto,
+ cnat_endpoint_tuple_t *paths, u8 flags,
+ cnat_lb_type_t lb_type)
{
cnat_endpoint_tuple_t *path;
const cnat_client_t *cc;
@@ -266,6 +398,7 @@ cnat_translation_update (cnat_endpoint_t * vip,
ct->ct_proto = proto;
ct->ct_cci = cci;
ct->index = ct - cnat_translation_pool;
+ ct->lb_type = lb_type;
cnat_add_translation_to_db (cci, vip, proto, ct->index);
cnat_client_translation_added (cci);
@@ -305,6 +438,7 @@ cnat_translation_update (cnat_endpoint_t * vip,
sizeof (trk->ct_ep[VLIB_TX]));
clib_memcpy (&trk->ct_ep[VLIB_RX], &path->src_ep,
sizeof (trk->ct_ep[VLIB_RX]));
+ trk->ct_flags = path->ep_flags;
cnat_tracker_track (ct->index, trk);
}
@@ -345,11 +479,13 @@ u8 *
format_cnat_translation (u8 * s, va_list * args)
{
cnat_translation_t *ct = va_arg (*args, cnat_translation_t *);
+ cnat_main_t *cm = &cnat_main;
cnat_ep_trk_t *ck;
s = format (s, "[%d] ", ct->index);
- s = format (s, "%U %U", format_cnat_endpoint, &ct->ct_vip,
+ s = format (s, "%U %U ", format_cnat_endpoint, &ct->ct_vip,
format_ip_protocol, ct->ct_proto);
+ s = format (s, "lb:%U ", format_cnat_lb_type, ct->lb_type);
vec_foreach (ck, ct->ct_paths)
s = format (s, "\n%U", format_cnat_ep_trk, ck, 2);
@@ -362,6 +498,25 @@ format_cnat_translation (u8 * s, va_list * args)
format_white_space, 2, format_dpo_id, &ct->ct_lb, 2);
}
+ u32 bid = 0;
+ if (CNAT_LB_MAGLEV == ct->lb_type)
+ {
+ s = format (s, "\nmaglev backends map");
+ uword *bitmap = NULL;
+ clib_bitmap_alloc (bitmap, cm->maglev_len);
+ vec_foreach (ck, ct->ct_paths)
+ {
+ clib_bitmap_zero (bitmap);
+ for (u32 i = 0; i < vec_len (ct->lb_maglev); i++)
+ if (ct->lb_maglev[i] == bid)
+ clib_bitmap_set (bitmap, i, 1);
+ s = format (s, "\n backend#%d: %U", bid, format_bitmap_hex, bitmap);
+
+ bid++;
+ }
+ clib_bitmap_free (bitmap);
+ }
+
return (s);
}
@@ -490,6 +645,7 @@ cnat_translation_cli_add_del (vlib_main_t * vm,
cnat_endpoint_tuple_t tmp, *paths = NULL, *path;
unformat_input_t _line_input, *line_input = &_line_input;
clib_error_t *e = 0;
+ cnat_lb_type_t lb_type;
/* Get a line of input. */
if (!unformat_user (input, unformat_line_input, line_input))
@@ -513,6 +669,8 @@ cnat_translation_cli_add_del (vlib_main_t * vm,
vec_add2 (paths, path, 1);
clib_memcpy (path, &tmp, sizeof (cnat_endpoint_tuple_t));
}
+ else if (unformat (line_input, "%U", unformat_cnat_lb_type, &lb_type))
+ ;
else
{
e = clib_error_return (0, "unknown input '%U'",
@@ -522,7 +680,7 @@ cnat_translation_cli_add_del (vlib_main_t * vm,
}
if (INDEX_INVALID == del_index)
- cnat_translation_update (&vip, proto, paths, flags);
+ cnat_translation_update (&vip, proto, paths, flags, lb_type);
else
cnat_translation_delete (del_index);
diff --git a/src/plugins/cnat/cnat_translation.h b/src/plugins/cnat/cnat_translation.h
index 8bec7396050..af0b94867af 100644
--- a/src/plugins/cnat/cnat_translation.h
+++ b/src/plugins/cnat/cnat_translation.h
@@ -28,6 +28,7 @@ extern vlib_combined_counter_main_t cnat_translation_counters;
typedef enum cnat_trk_flag_t_
{
CNAT_TRK_ACTIVE = (1 << 0),
+ CNAT_TRK_FLAG_NO_NAT = (1 << 1),
} cnat_trk_flag_t;
/**
@@ -80,6 +81,12 @@ typedef enum
CNAT_ADDR_N_RESOLUTIONS,
} cnat_addr_resol_type_t;
+/* How a translation maps a flow hash to one of its backends.
+ * Packed so that it occupies a single byte in cnat_translation_t. */
+typedef enum __attribute__ ((__packed__))
+{
+ /* Flow hash modulo the number of load-balance buckets */
+ CNAT_LB_DEFAULT,
+ /* Flow hash looked up in the translation's maglev table */
+ CNAT_LB_MAGLEV,
+} cnat_lb_type_t;
+
/**
* Entry used to account for a translation's backend
* waiting for address resolution
@@ -159,6 +166,16 @@ typedef struct cnat_translation_t_
* Translation flags
*/
u8 flags;
+
+ /**
+ * Type of load balancing
+ */
+ cnat_lb_type_t lb_type;
+
+ union
+ {
+ u32 *lb_maglev;
+ };
} cnat_translation_t;
extern cnat_translation_t *cnat_translation_pool;
@@ -174,10 +191,10 @@ extern u8 *format_cnat_translation (u8 * s, va_list * args);
*
* @return the ID of the translation. used to delete and gather stats
*/
-extern u32 cnat_translation_update (cnat_endpoint_t * vip,
+extern u32 cnat_translation_update (cnat_endpoint_t *vip,
ip_protocol_t ip_proto,
- cnat_endpoint_tuple_t *
- backends, u8 flags);
+ cnat_endpoint_tuple_t *backends, u8 flags,
+ cnat_lb_type_t lb_type);
/**
* Delete a translation
diff --git a/src/plugins/cnat/cnat_types.c b/src/plugins/cnat/cnat_types.c
index b6c6628961c..74c1c24389f 100644
--- a/src/plugins/cnat/cnat_types.c
+++ b/src/plugins/cnat/cnat_types.c
@@ -185,6 +185,7 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input)
cm->session_max_age = CNAT_DEFAULT_SESSION_MAX_AGE;
cm->tcp_max_age = CNAT_DEFAULT_TCP_MAX_AGE;
cm->default_scanner_state = CNAT_SCANNER_ON;
+ cm->maglev_len = CNAT_DEFAULT_MAGLEV_LEN;
cm->lazy_init_done = 0;
while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT)
@@ -217,6 +218,8 @@ cnat_config (vlib_main_t * vm, unformat_input_t * input)
;
else if (unformat (input, "tcp-max-age %u", &cm->tcp_max_age))
;
+ else if (unformat (input, "maglev-len %u", &cm->maglev_len))
+ ;
else
return clib_error_return (0, "unknown input '%U'",
format_unformat_error, input);
diff --git a/src/plugins/cnat/cnat_types.h b/src/plugins/cnat/cnat_types.h
index 2c1b7f9be50..f0911d22d75 100644
--- a/src/plugins/cnat/cnat_types.h
+++ b/src/plugins/cnat/cnat_types.h
@@ -42,6 +42,9 @@
#define CNAT_DEFAULT_TRANSLATION_MEMORY (256 << 10)
#define CNAT_DEFAULT_SNAT_MEMORY (64 << 20)
+/* Should be prime >~ 100 * numBackends */
+#define CNAT_DEFAULT_MAGLEV_LEN 1009
+
/* This should be strictly lower than FIB_SOURCE_INTERFACE
* from fib_source.h */
#define CNAT_FIB_SOURCE_PRIORITY 0x02
@@ -69,6 +72,7 @@ typedef struct cnat_endpoint_tuple_t_
{
cnat_endpoint_t dst_ep;
cnat_endpoint_t src_ep;
+ u8 ep_flags; /* cnat_trk_flag_t */
} cnat_endpoint_tuple_t;
typedef struct
@@ -144,6 +148,10 @@ typedef struct cnat_main_
/* Enable or Disable the scanner on startup */
u8 default_scanner_state;
+
+ /* Number of buckets for maglev, should be a
+ * prime >= 100 * max number of backends */
+ u32 maglev_len;
} cnat_main_t;
typedef struct cnat_timestamp_t_