Diffstat (limited to 'src/plugins/lb')
-rw-r--r--  src/plugins/lb/api.c              92
-rw-r--r--  src/plugins/lb/cli.c              33
-rw-r--r--  src/plugins/lb/lb.api             35
-rw-r--r--  src/plugins/lb/lb.c               23
-rw-r--r--  src/plugins/lb/lb.h               18
-rw-r--r--  src/plugins/lb/lb_plugin_doc.md  192
-rw-r--r--  src/plugins/lb/lb_plugin_doc.rst 223
-rw-r--r--  src/plugins/lb/lb_test.c          99
-rw-r--r--  src/plugins/lb/lb_types.api       10
-rw-r--r--  src/plugins/lb/lbhash.h            3
-rw-r--r--  src/plugins/lb/node.c            102
11 files changed, 548 insertions, 282 deletions
diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c
index e44f815cb9c..ea2e482135b 100644
--- a/src/plugins/lb/api.c
+++ b/src/plugins/lb/api.c
@@ -30,17 +30,15 @@
#include <lb/lb.api_enum.h>
#include <lb/lb.api_types.h>
-#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__)
#define REPLY_MSG_ID_BASE lbm->msg_id_base
#include <vlibapi/api_helper_macros.h>
-/* Macro to finish up custom dump fns */
-#define FINISH \
- vec_add1 (s, 0); \
- vl_print (handle, (char *)s); \
- vec_free (s); \
- return handle;
+#define FINISH \
+ vec_add1 (s, 0); \
+ vlib_cli_output (handle, (char *) s); \
+ vec_free (s); \
+ return handle;
static void
vl_api_lb_conf_t_handler
@@ -72,7 +70,7 @@ vl_api_lb_add_del_vip_t_handler
lb_main_t *lbm = &lb_main;
vl_api_lb_conf_reply_t * rmp;
int rv = 0;
- lb_vip_add_args_t args;
+ lb_vip_add_args_t args = {};
/* if port == 0, it means all-port VIP */
if (mp->port == 0)
@@ -130,6 +128,80 @@ vl_api_lb_add_del_vip_t_handler
}
static void
+vl_api_lb_add_del_vip_v2_t_handler (vl_api_lb_add_del_vip_v2_t *mp)
+{
+ lb_main_t *lbm = &lb_main;
+ vl_api_lb_conf_reply_t *rmp;
+ int rv = 0;
+ lb_vip_add_args_t args = {};
+
+ /* if port == 0, it means all-port VIP */
+ if (mp->port == 0)
+ {
+ mp->protocol = ~0;
+ }
+
+ ip_address_decode (&mp->pfx.address, &(args.prefix));
+
+ if (mp->is_del)
+ {
+ u32 vip_index;
+ if (!(rv = lb_vip_find_index (&(args.prefix), mp->pfx.len, mp->protocol,
+ ntohs (mp->port), &vip_index)))
+ rv = lb_vip_del (vip_index);
+ }
+ else
+ {
+ u32 vip_index;
+ lb_vip_type_t type = 0;
+
+ if (ip46_prefix_is_ip4 (&(args.prefix), mp->pfx.len))
+ {
+ if (mp->encap == LB_API_ENCAP_TYPE_GRE4)
+ type = LB_VIP_TYPE_IP4_GRE4;
+ else if (mp->encap == LB_API_ENCAP_TYPE_GRE6)
+ type = LB_VIP_TYPE_IP4_GRE6;
+ else if (mp->encap == LB_API_ENCAP_TYPE_L3DSR)
+ type = LB_VIP_TYPE_IP4_L3DSR;
+ else if (mp->encap == LB_API_ENCAP_TYPE_NAT4)
+ type = LB_VIP_TYPE_IP4_NAT4;
+ }
+ else
+ {
+ if (mp->encap == LB_API_ENCAP_TYPE_GRE4)
+ type = LB_VIP_TYPE_IP6_GRE4;
+ else if (mp->encap == LB_API_ENCAP_TYPE_GRE6)
+ type = LB_VIP_TYPE_IP6_GRE6;
+ else if (mp->encap == LB_API_ENCAP_TYPE_NAT6)
+ type = LB_VIP_TYPE_IP6_NAT6;
+ }
+
+ args.plen = mp->pfx.len;
+ args.protocol = mp->protocol;
+ args.port = ntohs (mp->port);
+ args.type = type;
+ args.new_length = ntohl (mp->new_flows_table_length);
+
+ if (mp->src_ip_sticky)
+ args.src_ip_sticky = 1;
+
+ if (mp->encap == LB_API_ENCAP_TYPE_L3DSR)
+ {
+ args.encap_args.dscp = (u8) (mp->dscp & 0x3F);
+ }
+ else if ((mp->encap == LB_API_ENCAP_TYPE_NAT4) ||
+ (mp->encap == LB_API_ENCAP_TYPE_NAT6))
+ {
+ args.encap_args.srv_type = mp->type;
+ args.encap_args.target_port = ntohs (mp->target_port);
+ }
+
+ rv = lb_vip_add (args, &vip_index);
+ }
+ REPLY_MACRO (VL_API_LB_ADD_DEL_VIP_V2_REPLY);
+}
+
+static void
vl_api_lb_add_del_as_t_handler
(vl_api_lb_add_del_as_t * mp)
{
@@ -211,7 +283,6 @@ static void send_lb_as_details
lb_main_t *lbm = &lb_main;
int msg_size = 0;
u32 *as_index;
- u32 asindex = 0;
/* construct as list under this vip */
lb_as_t *as;
@@ -235,7 +306,6 @@ static void send_lb_as_details
rmp->in_use_since = htonl(as->last_used);
vl_api_send_msg (reg, (u8 *) rmp);
- asindex++;
}
}
@@ -260,7 +330,6 @@ vl_api_lb_as_dump_t_handler
dump_all = (prefix.ip6.as_u64[0] == 0) && (prefix.ip6.as_u64[1] == 0);
- /* *INDENT-OFF* */
pool_foreach (vip, lbm->vips)
{
if ( dump_all
@@ -272,7 +341,6 @@ vl_api_lb_as_dump_t_handler
send_lb_as_details(reg, mp->context, vip);
}
}
- /* *INDENT-ON* */
}
static void
diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c
index 7b5dc5c8549..afa73ef616c 100644
--- a/src/plugins/lb/cli.c
+++ b/src/plugins/lb/cli.c
@@ -32,6 +32,7 @@ lb_vip_command_fn (vlib_main_t * vm,
clib_error_t *error = 0;
args.new_length = 1024;
+ args.src_ip_sticky = 0;
if (!unformat_user (input, unformat_line_input, line_input))
return 0;
@@ -49,6 +50,8 @@ lb_vip_command_fn (vlib_main_t * vm,
;
else if (unformat(line_input, "del"))
del = 1;
+ else if (unformat (line_input, "src_ip_sticky"))
+ args.src_ip_sticky = 1;
else if (unformat(line_input, "protocol tcp"))
{
args.protocol = (u8)IP_PROTOCOL_TCP;
@@ -177,6 +180,7 @@ done:
return error;
}
+/* clang-format off */
VLIB_CLI_COMMAND (lb_vip_command, static) =
{
.path = "lb vip",
@@ -185,9 +189,10 @@ VLIB_CLI_COMMAND (lb_vip_command, static) =
"[encap (gre6|gre4|l3dsr|nat4|nat6)] "
"[dscp <n>] "
"[type (nodeport|clusterip) target_port <n>] "
- "[new_len <n>] [del]",
+ "[new_len <n>] [src_ip_sticky] [del]",
.function = lb_vip_command_fn,
};
+/* clang-format on */
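
With the new keyword wired into the parser and the help string, a sticky VIP can be configured from the CLI like this (prefixes and sizes are illustrative only):

    lb vip 90.0.0.0/8 encap gre4 new_len 1024 src_ip_sticky
    lb vip 2003::/16 encap gre6 new_len 2048 src_ip_sticky del
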
static clib_error_t *
lb_as_command_fn (vlib_main_t * vm,
@@ -442,24 +447,22 @@ lb_set_interface_nat_command_fn (vlib_main_t * vm,
{
if (lb_nat4_interface_add_del (*sw_if_index, is_del))
{
- error = clib_error_return(
- 0, "%s %U failed", is_del ? "del" : "add",
- format_vnet_sw_interface_name, vnm,
- vnet_get_sw_interface (vnm, *sw_if_index));
- goto done;
- }
- }
+ error = clib_error_return (
+ 0, "%s %U failed", is_del ? "del" : "add",
+ format_vnet_sw_if_index_name, vnm, *sw_if_index);
+ goto done;
+ }
+ }
else
{
if (lb_nat6_interface_add_del (*sw_if_index, is_del))
{
- error = clib_error_return(
- 0, "%s %U failed", is_del ? "del" : "add",
- format_vnet_sw_interface_name, vnm,
- vnet_get_sw_interface (vnm, *sw_if_index));
- goto done;
- }
- }
+ error = clib_error_return (
+ 0, "%s %U failed", is_del ? "del" : "add",
+ format_vnet_sw_if_index_name, vnm, *sw_if_index);
+ goto done;
+ }
+ }
}
done:
diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api
index 4bf30e76b59..96f047ddbc2 100644
--- a/src/plugins/lb/lb.api
+++ b/src/plugins/lb/lb.api
@@ -1,4 +1,4 @@
-option version = "1.0.0";
+option version = "1.1.0";
import "plugins/lb/lb_types.api";
import "vnet/interface_types.api";
@@ -54,6 +54,39 @@ autoreply define lb_add_del_vip {
option vat_help = "<prefix> [protocol (tcp|udp) port <n>] [encap (gre6|gre4|l3dsr|nat4|nat6)] [dscp <n>] [type (nodeport|clusterip) target_port <n>] [new_len <n>] [del]";
};
+/** \brief Add a virtual address (or prefix)
+ @param client_index - opaque cookie to identify the sender
+ @param context - sender context, to match reply w/ request
+ @param pfx - ip prefix and length
+ @param protocol - tcp or udp.
+ @param port - destination port. (0) means 'all-port VIP'
+ @param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2) or NAT4(3) or NAT6(4).
+ @param dscp - DSCP bits corresponding to the VIP (applicable in L3DSR mode only).
+ @param type - service type (applicable in NAT4/NAT6 mode only).
+ @param target_port - Pod's port corresponding to the specific service (applicable in NAT4/NAT6 mode only).
+ @param node_port - Node's port (applicable in NAT4/NAT6 mode only).
+ @param new_flows_table_length - Size of the new connections flow table used
+ for this VIP (must be power of 2).
+ @param src_ip_sticky - source ip based sticky session.
+ @param is_del - The VIP should be removed.
+*/
+autoreply define lb_add_del_vip_v2 {
+ u32 client_index;
+ u32 context;
+ vl_api_address_with_prefix_t pfx;
+ u8 protocol [default=255];
+ u16 port;
+ vl_api_lb_encap_type_t encap;
+ u8 dscp;
+ vl_api_lb_srv_type_t type; /* LB_API_SRV_TYPE_CLUSTERIP */
+ u16 target_port;
+ u16 node_port;
+ u32 new_flows_table_length [default=1024];
+ bool src_ip_sticky;
+ bool is_del;
+ option vat_help = "<prefix> [protocol (tcp|udp) port <n>] [encap (gre6|gre4|l3dsr|nat4|nat6)] [dscp <n>] [type (nodeport|clusterip) target_port <n>] [new_len <n>] [src_ip_sticky] [del]";
+};
+
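For reference, a VAT invocation matching the vat_help string above might look like this (all values illustrative):

    lb_add_del_vip_v2 90.1.2.1/32 protocol tcp port 3306 encap nat4 type clusterip target_port 3307 src_ip_sticky
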
/** \brief Add an application server for a given VIP
@param client_index - opaque cookie to identify the sender
@param context - sender context, to match reply w/ request
diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c
index 6fc7f0f92b2..7ae1884ff31 100644
--- a/src/plugins/lb/lb.c
+++ b/src/plugins/lb/lb.c
@@ -198,15 +198,18 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args)
lb_vip_t *vip = va_arg (*args, lb_vip_t *);
u32 indent = format_get_indent (s);
- s = format(s, "%U %U [%lu] %U%s\n"
+ /* clang-format off */
+ s = format(s, "%U %U [%lu] %U%s%s\n"
"%U new_size:%u\n",
format_white_space, indent,
format_lb_vip_type, vip->type,
vip - lbm->vips,
format_ip46_prefix, &vip->prefix, (u32) vip->plen, IP46_TYPE_ANY,
+ lb_vip_is_src_ip_sticky (vip) ? " src_ip_sticky" : "",
(vip->flags & LB_VIP_FLAGS_USED)?"":" removed",
format_white_space, indent,
vip->new_flow_table_mask + 1);
+ /* clang-format on */
if (vip->port != 0)
{
@@ -370,9 +373,9 @@ void lb_garbage_collection()
}
vec_foreach(i, to_be_removed_vips) {
- vip = &lbm->vips[*i];
- pool_put(lbm->vips, vip);
- pool_free(vip->as_indexes);
+ vip = &lbm->vips[*i];
+ pool_free (vip->as_indexes);
+ pool_put (lbm->vips, vip);
}
vec_free(to_be_removed_vips);
@@ -411,7 +414,7 @@ out:
}
//First, let's sort the ASs
- vec_alloc(sort_arr, pool_elts(vip->as_indexes));
+ vec_validate (sort_arr, pool_elts (vip->as_indexes) - 1);
i = 0;
pool_foreach (as_index, vip->as_indexes) {
@@ -422,7 +425,7 @@ out:
sort_arr[i].as_index = as - lbm->ass;
i++;
}
- _vec_len(sort_arr) = i;
+ vec_set_len (sort_arr, i);
vec_sort_with_function(sort_arr, lb_pseudorand_compare);
@@ -1147,6 +1150,10 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index)
}
vip->flags = LB_VIP_FLAGS_USED;
+ if (args.src_ip_sticky)
+ {
+ vip->flags |= LB_VIP_FLAGS_SRC_IP_STICKY;
+ }
vip->as_indexes = 0;
//Validate counters
@@ -1249,12 +1256,10 @@ int lb_vip_del(u32 vip_index)
return rv;
}
-/* *INDENT-OFF* */
VLIB_PLUGIN_REGISTER () = {
.version = VPP_BUILD_VER,
.description = "Load Balancer (LB)",
};
-/* *INDENT-ON* */
u8 *format_lb_dpo (u8 * s, va_list * va)
{
@@ -1412,7 +1417,7 @@ lb_init (vlib_main_t * vm)
lb_dpo_nat4_port_nodes);
lbm->dpo_nat6_port_type = dpo_register_new_type(&lb_vft,
lb_dpo_nat6_port_nodes);
- lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft);
+ lbm->fib_node_type = fib_node_register_new_type ("lb", &lb_fib_node_vft);
//Init AS reference counters
vlib_refcount_init(&lbm->as_refcount);
diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h
index ebbb1f6f8f0..46da40970c9 100644
--- a/src/plugins/lb/lb.h
+++ b/src/plugins/lb/lb.h
@@ -22,7 +22,7 @@
* The load-balancer receives traffic destined to VIP (Virtual IP)
* addresses from one or multiple(ECMP) routers.
* The load-balancer tunnels the traffic toward many application servers
- * ensuring session stickyness (i.e. that a single sessions is tunneled
+ * ensuring session stickiness (i.e. that a single sessions is tunneled
* towards a single application server).
*
*/
@@ -324,6 +324,7 @@ typedef struct {
*/
u8 flags;
#define LB_VIP_FLAGS_USED 0x1
+#define LB_VIP_FLAGS_SRC_IP_STICKY 0x2
/**
* Pool of AS indexes used for this VIP.
@@ -346,11 +347,14 @@ typedef struct {
|| (vip)->type == LB_VIP_TYPE_IP4_L3DSR \
|| (vip)->type == LB_VIP_TYPE_IP4_NAT4 )
+#define lb_vip_is_src_ip_sticky(vip) \
+ (((vip)->flags & LB_VIP_FLAGS_SRC_IP_STICKY) != 0)
+
+/* clang-format off */
#define lb_vip_is_gre4(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE4 \
|| (vip)->type == LB_VIP_TYPE_IP4_GRE4) \
&& ((vip)->port == 0))
-
#define lb_vip_is_gre6(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE6 \
|| (vip)->type == LB_VIP_TYPE_IP4_GRE6) \
&& ((vip)->port == 0))
@@ -362,27 +366,28 @@ typedef struct {
#define lb_vip_is_gre6_port(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE6 \
|| (vip)->type == LB_VIP_TYPE_IP4_GRE6) \
&& ((vip)->port != 0))
+/* clang-format on */
always_inline bool
lb_vip_is_l3dsr(const lb_vip_t *vip)
{
- return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port ==0);
+ return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port == 0);
}
always_inline bool
lb_vip_is_l3dsr_port(const lb_vip_t *vip)
{
- return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port !=0);
+ return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port != 0);
}
always_inline bool
lb_vip_is_nat4_port(const lb_vip_t *vip)
{
- return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port !=0);
+ return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port != 0);
}
always_inline bool
lb_vip_is_nat6_port(const lb_vip_t *vip)
{
- return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port !=0);
+ return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port != 0);
}
format_function_t format_lb_vip;
@@ -575,6 +580,7 @@ typedef struct {
u8 plen;
u8 protocol;
u16 port;
+ u8 src_ip_sticky;
lb_vip_type_t type;
u32 new_length;
lb_vip_encap_args_t encap_args;
diff --git a/src/plugins/lb/lb_plugin_doc.md b/src/plugins/lb/lb_plugin_doc.md
deleted file mode 100644
index 5f6538974e9..00000000000
--- a/src/plugins/lb/lb_plugin_doc.md
+++ /dev/null
@@ -1,192 +0,0 @@
-# Load Balancer plugin for VPP {#lb_plugin_doc}
-
-## Version
-
-The load balancer plugin is currently in *beta* version.
-Both CLIs and APIs are subject to *heavy* changes,
-which also means feedback is really welcome regarding features, apis, etc...
-
-## Overview
-
-This plugin provides load balancing for VPP in a way that is largely inspired
-from Google's MagLev: http://research.google.com/pubs/pub44824.html
-
-The load balancer is configured with a set of Virtual IPs (VIP, which can be
-prefixes), and for each VIP, with a set of Application Server addresses (ASs).
-
-There are four encap types to steer traffic to different ASs:
-1). IPv4+GRE ad IPv6+GRE encap types:
-Traffic received for a given VIP (or VIP prefix) is tunneled using GRE towards
-the different ASs in a way that (tries to) ensure that a given session will
-always be tunneled to the same AS.
-
-2). IPv4+L3DSR encap types:
-L3DSR is used to overcome Layer 2 limitations of Direct Server Return Load Balancing.
-It maps VIP to DSCP bits, and reuse TOS bits to transfer DSCP bits
-to server, and then server will get VIP from DSCP-to-VIP mapping.
-
-Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must be using
-the same encap. type (i.e. IPv4+GRE or IPv6+GRE or IPv4+L3DSR).
-Meaning that for a given VIP, all AS addresses must be of the same family.
-
-3). IPv4/IPv6 + NAT4/NAT6 encap types:
-This type provides kube-proxy data plane on user space,
-which is used to replace linux kernel's kube-proxy based on iptables.
-
-Currently, load balancer plugin supports three service types:
-a) Cluster IP plus Port: support any protocols, including TCP, UDP.
-b) Node IP plus Node Port: currently only support UDP.
-c) External Load Balancer.
-
-For Cluster IP plus Port case:
-kube-proxy is configured with a set of Virtual IPs (VIP, which can be
-prefixes), and for each VIP, with a set of AS addresses (ASs).
-
-For a specific session received for a given VIP (or VIP prefix),
-first packet selects a AS according to internal load balancing algorithm,
-then does DNAT operation and sent to chosen AS.
-At the same time, will create a session entry to store AS chosen result.
-Following packets for that session will look up session table first,
-which ensures that a given session will always be routed to the same AS.
-
-For returned packet from AS, it will do SNAT operation and sent out.
-
-Please refer to below for details:
-https://schd.ws/hosted_files/ossna2017/1e/VPP_K8S_GTPU_OSSNA.pdf
-
-
-## Performance
-
-The load balancer has been tested up to 1 millions flows and still forwards more
-than 3Mpps per core in such circumstances.
-Although 3Mpps seems already good, it is likely that performance will be improved
-in next versions.
-
-## Configuration
-
-### Global LB parameters
-
-The load balancer needs to be configured with some parameters:
-
- lb conf [ip4-src-address <addr>] [ip6-src-address <addr>]
- [buckets <n>] [timeout <s>]
-
-ip4-src-address: the source address used to send encap. packets using IPv4 for GRE4 mode.
- or Node IP4 address for NAT4 mode.
-
-ip6-src-address: the source address used to send encap. packets using IPv6 for GRE6 mode.
- or Node IP6 address for NAT6 mode.
-
-buckets: the *per-thread* established-connections-table number of buckets.
-
-timeout: the number of seconds a connection will remain in the
- established-connections-table while no packet for this flow
- is received.
-
-### Configure the VIPs
-
- lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] \
- [dscp <n>] [port <n> target_port <n> node_port <n>] [new_len <n>] [del]
-
-new_len is the size of the new-connection-table. It should be 1 or 2 orders of
-magnitude bigger than the number of ASs for the VIP in order to ensure a good
-load balancing.
-Encap l3dsr and dscp is used to map VIP to dscp bit and rewrite DSCP bit in packets.
-So the selected server could get VIP from DSCP bit in this packet and perform DSR.
-Encap nat4/nat6 and port/target_port/node_port is used to do kube-proxy data plane.
-
-Examples:
-
- lb vip 2002::/16 encap gre6 new_len 1024
- lb vip 2003::/16 encap gre4 new_len 2048
- lb vip 80.0.0.0/8 encap gre6 new_len 16
- lb vip 90.0.0.0/8 encap gre4 new_len 1024
- lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32
- lb vip 90.1.2.1/32 encap nat4 port 3306 target_port 3307 node_port 30964 new_len 1024
- lb vip 2004::/16 encap nat6 port 6306 target_port 6307 node_port 30966 new_len 1024
-
-### Configure the ASs (for each VIP)
-
- lb as <vip-prefix> [<address> [<address> [...]]] [del]
-
-You can add (or delete) as many ASs at a time (for a single VIP).
-Note that the AS address family must correspond to the VIP encap. IP family.
-
-Examples:
-
- lb as 2002::/16 2001::2 2001::3 2001::4
- lb as 2003::/16 10.0.0.1 10.0.0.2
- lb as 80.0.0.0/8 2001::2
- lb as 90.0.0.0/8 10.0.0.1
-
-### Configure SNAT
-
- lb set interface nat4 in <intfc> [del]
-
-Set SNAT feature in a specific interface.
-(applicable in NAT4 mode only)
-
- lb set interface nat6 in <intfc> [del]
-
-Set SNAT feature in a specific interface.
-(applicable in NAT6 mode only)
-
-## Monitoring
-
-The plugin provides quite a bunch of counters and information.
-These are still subject to quite significant changes.
-
- show lb
- show lb vip
- show lb vip verbose
-
- show node counters
-
-
-## Design notes
-
-### Multi-Threading
-
-MagLev is a distributed system which pseudo-randomly generates a
-new-connections-table based on AS names such that each server configured with
-the same set of ASs ends up with the same table. Connection stickyness is then
-ensured with an established-connections-table. Using ECMP, it is assumed (but
-not relied on) that servers will mostly receive traffic for different flows.
-
-This implementation pushes the parallelism a little bit further by using
-one established-connections table per thread. This is equivalent to assuming
-that RSS will make a job similar to ECMP, and is pretty useful as threads don't
-need to get a lock in order to write in the table.
-
-### Hash Table
-
-A load balancer requires an efficient read and write hash table. The hash table
-used by ip6-forward is very read-efficient, but not so much for writing. In
-addition, it is not a big deal if writing into the hash table fails (again,
-MagLev uses a flow table but does not heaviliy relies on it).
-
-The plugin therefore uses a very specific (and stupid) hash table.
- - Fixed (and power of 2) number of buckets (configured at runtime)
- - Fixed (and power of 2) elements per buckets (configured at compilation time)
-
-### Reference counting
-
-When an AS is removed, there is two possible ways to react.
- - Keep using the AS for established connections
- - Change AS for established connections (likely to cause error for TCP)
-
-In the first case, although an AS is removed from the configuration, its
-associated state needs to stay around as long as it is used by at least one
-thread.
-
-In order to avoid locks, a specific reference counter is used. The design is quite
-similar to clib counters but:
- - It is possible to decrease the value
- - Summing will not zero the per-thread counters
- - Only the thread can reallocate its own counters vector (to avoid concurrency issues)
-
-This reference counter is lock free, but reading a count of 0 does not mean
-the value can be freed unless it is ensured by *other* means that no other thread
-is concurrently referencing the object. In the case of this plugin, it is assumed
-that no concurrent event will take place after a few seconds.
-
diff --git a/src/plugins/lb/lb_plugin_doc.rst b/src/plugins/lb/lb_plugin_doc.rst
new file mode 100644
index 00000000000..603453e7848
--- /dev/null
+++ b/src/plugins/lb/lb_plugin_doc.rst
@@ -0,0 +1,223 @@
+Load Balancer plugin
+====================
+
+Version
+-------
+
+The load balancer plugin is currently in *beta* version. Both CLIs and
+APIs are subject to *heavy* changes, which also means feedback is really
+welcome regarding features, apis, etc…
+
+Overview
+--------
+
+This plugin provides load balancing for VPP in a way that is largely
+inspired from Google’s MagLev:
+http://research.google.com/pubs/pub44824.html
+
+The load balancer is configured with a set of Virtual IPs (VIP, which
+can be prefixes), and for each VIP, with a set of Application Server
+addresses (ASs).
+
+There are four encap types to steer traffic to different ASs:
+
+1). IPv4+GRE and IPv6+GRE encap types: Traffic received for a given VIP
+(or VIP prefix) is tunneled using GRE towards the different ASs in a
+way that (tries to) ensure that a given session will always be tunneled
+to the same AS.
+
+2). IPv4+L3DSR encap types: L3DSR is used to overcome Layer 2
+limitations of Direct Server Return Load Balancing. It maps the VIP to
+DSCP bits and reuses the TOS bits to carry them to the server; the
+server then recovers the VIP from its DSCP-to-VIP mapping.
+
+Both VIPs and ASs can be IPv4 or IPv6, but for a given VIP, all ASs
+must use the same encap type (i.e. IPv4+GRE, IPv6+GRE or IPv4+L3DSR),
+meaning that for a given VIP, all AS addresses must be of the same
+family.
+
+3). IPv4/IPv6 + NAT4/NAT6 encap types: This type provides a user-space
+kube-proxy data plane, replacing the Linux kernel's iptables-based
+kube-proxy.
+
+Currently, the load balancer plugin supports three service types:
+
+a) Cluster IP plus Port: supports any protocol, including TCP and UDP.
+b) Node IP plus Node Port: currently only supports UDP.
+c) External Load Balancer.
+
+For Cluster IP plus Port case: kube-proxy is configured with a set of
+Virtual IPs (VIP, which can be prefixes), and for each VIP, with a set
+of AS addresses (ASs).
+
+For a specific session received for a given VIP (or VIP prefix), the
+first packet selects an AS according to the internal load-balancing
+algorithm, is DNATed, and is sent to the chosen AS. At the same time, a
+session entry is created to record the chosen AS. Following packets for
+that session first look up the session table, which ensures that a
+given session will always be routed to the same AS.
+
+Returning packets from the AS are SNATed and sent out.
+
+Please refer to below for details:
+https://schd.ws/hosted_files/ossna2017/1e/VPP_K8S_GTPU_OSSNA.pdf
+
+Performance
+-----------
+
+The load balancer has been tested with up to 1 million flows and still
+forwards more than 3 Mpps per core under those conditions. Although
+3 Mpps is already good, performance is likely to improve in future
+versions.
+
+Configuration
+-------------
+
+Global LB parameters
+~~~~~~~~~~~~~~~~~~~~
+
+The load balancer needs to be configured with some parameters:
+
+::
+
+ lb conf [ip4-src-address <addr>] [ip6-src-address <addr>]
+ [buckets <n>] [timeout <s>]
+
+ip4-src-address: the source address used to send encapsulated packets
+over IPv4 in GRE4 mode, or the node's IPv4 address in NAT4 mode.
+
+ip6-src-address: the source address used to send encapsulated packets
+over IPv6 in GRE6 mode, or the node's IPv6 address in NAT6 mode.
+
+buckets: the *per-thread* established-connections-table number of
+buckets.
+
+timeout: the number of seconds a connection will remain in the
+established-connections-table while no packet for this flow is received.
+
+Configure the VIPs
+~~~~~~~~~~~~~~~~~~
+
+::
+
+ lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] \
+ [dscp <n>] [port <n> target_port <n> node_port <n>] [new_len <n>] [del]
+
+new_len is the size of the new-connection table. It should be one or
+two orders of magnitude bigger than the number of ASs for the VIP in
+order to ensure good load balancing. Encap l3dsr together with dscp
+maps the VIP to a DSCP value and rewrites the DSCP bits in packets, so
+the selected server can recover the VIP from the DSCP bits and perform
+DSR. Encap nat4/nat6 together with port/target_port/node_port
+implements the kube-proxy data plane.
+
+Examples:
+
+::
+
+ lb vip 2002::/16 encap gre6 new_len 1024
+ lb vip 2003::/16 encap gre4 new_len 2048
+ lb vip 80.0.0.0/8 encap gre6 new_len 16
+ lb vip 90.0.0.0/8 encap gre4 new_len 1024
+ lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32
+ lb vip 90.1.2.1/32 encap nat4 port 3306 target_port 3307 node_port 30964 new_len 1024
+ lb vip 2004::/16 encap nat6 port 6306 target_port 6307 node_port 30966 new_len 1024
+
+Configure the ASs (for each VIP)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ lb as <vip-prefix> [<address> [<address> [...]]] [del]
+
+You can add (or delete) as many ASs at a time (for a single VIP). Note
+that the AS address family must correspond to the VIP encap. IP family.
+
+Examples:
+
+::
+
+ lb as 2002::/16 2001::2 2001::3 2001::4
+ lb as 2003::/16 10.0.0.1 10.0.0.2
+ lb as 80.0.0.0/8 2001::2
+ lb as 90.0.0.0/8 10.0.0.1
+
+Configure SNAT
+~~~~~~~~~~~~~~
+
+::
+
+ lb set interface nat4 in <intfc> [del]
+
+Enable the SNAT feature on a specific interface (applicable in NAT4
+mode only).
+
+::
+
+ lb set interface nat6 in <intfc> [del]
+
+Enable the SNAT feature on a specific interface (applicable in NAT6
+mode only).
+
+Monitoring
+----------
+
+The plugin provides a number of counters and other information. These
+are still subject to significant changes.
+
+::
+
+ show lb
+ show lb vip
+ show lb vip verbose
+
+ show node counters
+
+Design notes
+------------
+
+Multi-Threading
+~~~~~~~~~~~~~~~
+
+MagLev is a distributed system which pseudo-randomly generates a
+new-connections-table based on AS names such that each server configured
+with the same set of ASs ends up with the same table. Connection
+stickiness is then ensured with an established-connections-table. Using
+ECMP, it is assumed (but not relied on) that servers will mostly receive
+traffic for different flows.
+
+This implementation pushes the parallelism a little bit further by
+using one established-connections table per thread. This is equivalent
+to assuming that RSS will do a job similar to ECMP's, and is quite
+useful, as threads don't need to take a lock in order to write to the
+table.
+
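A minimal sketch of the per-thread layout this describes (the names here are illustrative assumptions, not the plugin's exact definitions):

    /* Sketch: one established-connections table per worker thread.
     * Each worker reads and writes only its own slot, so the data
     * path takes no lock. */
    typedef struct { lb_hash_t *sticky_ht; } sketch_per_cpu_t;

    sketch_per_cpu_t *per_cpu;  /* vector with one entry per thread */

    static inline lb_hash_t *
    sketch_get_sticky_table (u32 thread_index)
    {
      return per_cpu[thread_index].sticky_ht; /* owned by this thread */
    }
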
+Hash Table
+~~~~~~~~~~
+
+A load balancer requires an efficient read and write hash table. The
+hash table used by ip6-forward is very read-efficient, but not so much
+for writing. In addition, it is not a big deal if writing into the hash
+table fails (again, MagLev uses a flow table but does not rely heavily
+on it).
+
+The plugin therefore uses a very specific (and stupid) hash table:
+
+- Fixed (and power of 2) number of buckets (configured at runtime)
+- Fixed (and power of 2) number of elements per bucket (configured at
+  compile time)
+
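The shape of such a table can be sketched in a few lines (an illustration of the idea only, not the actual lbhash.h layout):

    #include <stdint.h>

    #define SLOTS_PER_BUCKET 4            /* fixed at compile time */

    typedef struct {
      uint32_t hash[SLOTS_PER_BUCKET];
      uint32_t value[SLOTS_PER_BUCKET];
    } bucket_t;

    /* buckets_mask == n_buckets - 1, with n_buckets a power of 2, so
     * the bucket is selected with a mask rather than a modulo. */
    static inline uint32_t
    sketch_lookup (const bucket_t *buckets, uint32_t buckets_mask,
                   uint32_t hash, uint32_t no_value)
    {
      const bucket_t *b = &buckets[hash & buckets_mask];
      for (int i = 0; i < SLOTS_PER_BUCKET; i++)
        if (b->hash[i] == hash)
          return b->value[i];
      return no_value; /* a miss (or a failed insert) is acceptable */
    }
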
+Reference counting
+~~~~~~~~~~~~~~~~~~
+
+When an AS is removed, there are two possible ways to react:
+
+- Keep using the AS for established connections
+- Change the AS for established connections (likely to cause errors for
+  TCP)
+
+In the first case, although an AS is removed from the configuration, its
+associated state needs to stay around as long as it is used by at least
+one thread.
+
+In order to avoid locks, a specific reference counter is used. The
+design is quite similar to clib counters, but:
+
+- It is possible to decrease the value
+- Summing will not zero the per-thread counters
+- Only the owning thread can reallocate its own counters vector (to
+  avoid concurrency issues)
+
+This reference counter is lock free, but reading a count of 0 does not
+mean the value can be freed unless it is ensured by *other* means that
+no other thread is concurrently referencing the object. In the case of
+this plugin, it is assumed that no concurrent event will take place
+after a few seconds.
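
The counter described above can be sketched as follows (names are hypothetical; the plugin's actual implementation is the vlib_refcount initialized in lb.c's lb_init):

    #include <stdint.h>

    #define SKETCH_MAX_THREADS 8

    typedef struct {
      int64_t counter;  /* padded to a cache line in practice */
    } per_thread_count_t;

    typedef struct {
      per_thread_count_t per_thread[SKETCH_MAX_THREADS];
    } sketch_refcount_t;

    /* Only thread `thread_index` writes its own slot, so no lock or
     * atomic is needed; a local value may legitimately go negative
     * (acquired on one thread, released on another). */
    static inline void
    sketch_refcount_add (sketch_refcount_t *r, uint32_t thread_index,
                         int32_t v)
    {
      r->per_thread[thread_index].counter += v;
    }

    /* Summing does not zero the per-thread counters, and a total of 0
     * does not by itself prove the object is unreferenced. */
    static inline int64_t
    sketch_refcount_sum (const sketch_refcount_t *r, uint32_t n_threads)
    {
      int64_t sum = 0;
      for (uint32_t i = 0; i < n_threads; i++)
        sum += r->per_thread[i].counter;
      return sum;
    }
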
diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c
index 80fc38e2746..f64bdd220b5 100644
--- a/src/plugins/lb/lb_test.c
+++ b/src/plugins/lb/lb_test.c
@@ -207,6 +207,105 @@ static int api_lb_add_del_vip (vat_main_t * vam)
return ret;
}
+static int
+api_lb_add_del_vip_v2 (vat_main_t *vam)
+{
+ unformat_input_t *line_input = vam->input;
+ vl_api_lb_add_del_vip_v2_t *mp;
+ int ret;
+ ip46_address_t ip_prefix;
+ u8 prefix_length = 0;
+ u8 protocol = 0;
+ u32 port = 0;
+ u32 encap = 0;
+ u32 dscp = ~0;
+ u32 srv_type = LB_SRV_TYPE_CLUSTERIP;
+ u32 target_port = 0;
+ u32 new_length = 1024;
+ u8 src_ip_sticky = 0;
+ int is_del = 0;
+
+ if (!unformat (line_input, "%U", unformat_ip46_prefix, &ip_prefix,
+ &prefix_length, IP46_TYPE_ANY, &prefix_length))
+ {
+ errmsg ("lb_add_del_vip: invalid vip prefix\n");
+ return -99;
+ }
+
+ while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT)
+ {
+ if (unformat (line_input, "new_len %d", &new_length))
+ ;
+ else if (unformat (line_input, "del"))
+ is_del = 1;
+ else if (unformat (line_input, "src_ip_sticky"))
+ src_ip_sticky = 1;
+ else if (unformat (line_input, "protocol tcp"))
+ {
+ protocol = IP_PROTOCOL_TCP;
+ }
+ else if (unformat (line_input, "protocol udp"))
+ {
+ protocol = IP_PROTOCOL_UDP;
+ }
+ else if (unformat (line_input, "port %d", &port))
+ ;
+ else if (unformat (line_input, "encap gre4"))
+ encap = LB_ENCAP_TYPE_GRE4;
+ else if (unformat (line_input, "encap gre6"))
+ encap = LB_ENCAP_TYPE_GRE6;
+ else if (unformat (line_input, "encap l3dsr"))
+ encap = LB_ENCAP_TYPE_L3DSR;
+ else if (unformat (line_input, "encap nat4"))
+ encap = LB_ENCAP_TYPE_NAT4;
+ else if (unformat (line_input, "encap nat6"))
+ encap = LB_ENCAP_TYPE_NAT6;
+ else if (unformat (line_input, "dscp %d", &dscp))
+ ;
+ else if (unformat (line_input, "type clusterip"))
+ srv_type = LB_SRV_TYPE_CLUSTERIP;
+ else if (unformat (line_input, "type nodeport"))
+ srv_type = LB_SRV_TYPE_NODEPORT;
+ else if (unformat (line_input, "target_port %d", &target_port))
+ ;
+ else
+ {
+ errmsg ("invalid arguments\n");
+ return -99;
+ }
+ }
+
+ if ((encap != LB_ENCAP_TYPE_L3DSR) && (dscp != ~0))
+ {
+ errmsg ("lb_vip_add error: should not configure dscp for none L3DSR.");
+ return -99;
+ }
+
+ if ((encap == LB_ENCAP_TYPE_L3DSR) && (dscp >= 64))
+ {
+ errmsg ("lb_vip_add error: dscp for L3DSR should be less than 64.");
+ return -99;
+ }
+
+ M (LB_ADD_DEL_VIP_V2, mp);
+ ip_address_encode (&ip_prefix, IP46_TYPE_ANY, &mp->pfx.address);
+ mp->pfx.len = prefix_length;
+ mp->protocol = (u8) protocol;
+ mp->port = htons ((u16) port);
+ mp->encap = (u8) encap;
+ mp->dscp = (u8) dscp;
+ mp->type = (u8) srv_type;
+ mp->target_port = htons ((u16) target_port);
+ mp->node_port = htons ((u16) target_port);
+ mp->new_flows_table_length = htonl (new_length);
+ mp->is_del = is_del;
+ mp->src_ip_sticky = src_ip_sticky;
+
+ S (mp);
+ W (ret);
+ return ret;
+}
+
static int api_lb_add_del_as (vat_main_t * vam)
{
diff --git a/src/plugins/lb/lb_types.api b/src/plugins/lb/lb_types.api
index 3378a5fec4f..a6e1980b6be 100644
--- a/src/plugins/lb/lb_types.api
+++ b/src/plugins/lb/lb_types.api
@@ -28,9 +28,9 @@ enum lb_encap_type
LB_API_ENCAP_TYPE_GRE4 = 0,
LB_API_ENCAP_TYPE_GRE6 = 1,
LB_API_ENCAP_TYPE_L3DSR = 2,
- LB_API_ENCAP_TYPE_NAT4 = 3 ,
- LB_API_ENCAP_TYPE_NAT6 =4,
- LB_API_ENCAP_N_TYPES =5,
+ LB_API_ENCAP_TYPE_NAT4 = 3,
+ LB_API_ENCAP_TYPE_NAT6 = 4,
+ LB_API_ENCAP_N_TYPES = 5,
};
/* Lookup types */
@@ -38,8 +38,8 @@ enum lb_lkp_type_t
{
LB_API_LKP_SAME_IP_PORT = 0,
LB_API_LKP_DIFF_IP_PORT = 1,
- LB_API_LKP_ALL_PORT_IP =2,
- LB_API_LKP_N_TYPES =3,
+ LB_API_LKP_ALL_PORT_IP = 2,
+ LB_API_LKP_N_TYPES = 3,
};
enum lb_vip_type
diff --git a/src/plugins/lb/lbhash.h b/src/plugins/lb/lbhash.h
index f822d79ded8..8253e9d52f0 100644
--- a/src/plugins/lb/lbhash.h
+++ b/src/plugins/lb/lbhash.h
@@ -88,8 +88,7 @@ lb_hash_t *lb_hash_alloc(u32 buckets, u32 timeout)
sizeof(lb_hash_bucket_t) * (buckets + 1);
u8 *mem = 0;
lb_hash_t *h;
- vec_alloc_aligned(mem, size, CLIB_CACHE_LINE_BYTES);
- clib_memset(mem, 0, size);
+ vec_validate_aligned (mem, size - 1, CLIB_CACHE_LINE_BYTES);
h = (lb_hash_t *)mem;
h->buckets_mask = (buckets - 1);
h->timeout = timeout;
diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c
index b5e9da71376..a37fe11a9b4 100644
--- a/src/plugins/lb/node.c
+++ b/src/plugins/lb/node.c
@@ -174,26 +174,22 @@ lb_node_get_other_ports6 (ip6_header_t *ip60)
}
static_always_inline void
-lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4,
- u32 *hash, u32 *vip_idx, u8 per_port_vip)
+lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4, u32 *hash,
+ u32 *vip_idx, u8 per_port_vip)
{
vip_port_key_t key;
clib_bihash_kv_8_8_t kv, value;
+ ip4_header_t *ip40;
+ ip6_header_t *ip60;
+ lb_vip_t *vip0;
+ u64 ports;
/* For vip case, retrieve vip index for ip lookup */
*vip_idx = vnet_buffer (p)->ip.adj_index[VLIB_TX];
- if (per_port_vip)
- {
- /* For per-port-vip case, ip lookup stores placeholder index */
- key.vip_prefix_index = *vip_idx;
- }
-
+ /* Extract the L4 port number from the packet */
if (is_input_v4)
{
- ip4_header_t *ip40;
- u64 ports;
-
ip40 = vlib_buffer_get_current (p);
if (PREDICT_TRUE(
ip40->protocol == IP_PROTOCOL_TCP
@@ -202,20 +198,10 @@ lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4,
| ((u64) ((udp_header_t *) (ip40 + 1))->dst_port);
else
ports = lb_node_get_other_ports4 (ip40);
-
- *hash = lb_hash_hash (*((u64 *) &ip40->address_pair), ports, 0, 0, 0);
-
- if (per_port_vip)
- {
- key.protocol = ip40->protocol;
- key.port = (u16)(ports & 0xFFFF);
- }
}
else
{
- ip6_header_t *ip60;
ip60 = vlib_buffer_get_current (p);
- u64 ports;
if (PREDICT_TRUE(
ip60->protocol == IP_PROTOCOL_TCP
@@ -224,33 +210,68 @@ lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4,
| ((u64) ((udp_header_t *) (ip60 + 1))->dst_port);
else
ports = lb_node_get_other_ports6 (ip60);
-
- *hash = lb_hash_hash (ip60->src_address.as_u64[0],
- ip60->src_address.as_u64[1],
- ip60->dst_address.as_u64[0],
- ip60->dst_address.as_u64[1], ports);
-
- if (per_port_vip)
- {
- key.protocol = ip60->protocol;
- key.port = (u16)(ports & 0xFFFF);
- }
}
- /* For per-port-vip case, retrieve vip index for vip_port_filter table */
if (per_port_vip)
{
+ /* For per-port-vip case, ip lookup stores placeholder index */
+ key.vip_prefix_index = *vip_idx;
+ key.port = (u16) (ports & 0xFFFF);
+ key.rsv = 0;
+ if (is_input_v4)
+ {
+ key.protocol = ip40->protocol;
+ }
+ else
+ {
+ key.protocol = ip60->protocol;
+ }
+
+ /* For per-port-vip case, retrieve vip index for vip_port_filter table */
kv.key = key.as_u64;
- if (clib_bihash_search_8_8(&lbm->vip_index_per_port, &kv, &value) < 0)
- {
- /* return default vip */
- *vip_idx = 0;
- return;
- }
- *vip_idx = value.value;
+ if (clib_bihash_search_8_8 (&lbm->vip_index_per_port, &kv, &value) < 0)
+ {
+ /* Set default vip */
+ *vip_idx = 0;
+ }
+ else
+ {
+ *vip_idx = value.value;
+ }
+ }
+
+ vip0 = pool_elt_at_index (lbm->vips, *vip_idx);
+
+ if (is_input_v4)
+ {
+ if (lb_vip_is_src_ip_sticky (vip0))
+ {
+ *hash = lb_hash_hash (*((u64 *) &ip40->address_pair), 0, 0, 0, 0);
+ }
+ else
+ {
+ *hash =
+ lb_hash_hash (*((u64 *) &ip40->address_pair), ports, 0, 0, 0);
+ }
+ }
+ else
+ {
+ if (lb_vip_is_src_ip_sticky (vip0))
+ {
+ *hash = lb_hash_hash (
+ ip60->src_address.as_u64[0], ip60->src_address.as_u64[1],
+ ip60->dst_address.as_u64[0], ip60->dst_address.as_u64[1], 0);
+ }
+ else
+ {
+ *hash = lb_hash_hash (
+ ip60->src_address.as_u64[0], ip60->src_address.as_u64[1],
+ ip60->dst_address.as_u64[0], ip60->dst_address.as_u64[1], ports);
+ }
}
}
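
The effect of the new flag is visible directly in the hash inputs above: when src_ip_sticky is set, the ports word is replaced by zero, so the L4 ports no longer influence AS selection. A hypothetical illustration:

    /* Hypothetical flows: with src_ip_sticky set, both hash to the
     * same value and therefore reach the same AS:
     *   10.0.0.1:1111 -> VIP:80  => lb_hash_hash (addr_pair, 0, 0, 0, 0)
     *   10.0.0.1:2222 -> VIP:80  => lb_hash_hash (addr_pair, 0, 0, 0, 0)
     * Without the flag, the differing ports words yield independent
     * hashes, so the two flows may be balanced to different ASs. */
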
+/* clang-format off */
static_always_inline uword
lb_node_fn (vlib_main_t * vm,
vlib_node_runtime_t * node,
@@ -565,6 +586,7 @@ lb_node_fn (vlib_main_t * vm,
return frame->n_vectors;
}
+/* clang-format on */
u8 *
format_nodeport_lb_trace (u8 * s, va_list * args)