diff options
Diffstat (limited to 'src/plugins/lb')
-rw-r--r-- | src/plugins/lb/api.c | 92 | ||||
-rw-r--r-- | src/plugins/lb/cli.c | 33 | ||||
-rw-r--r-- | src/plugins/lb/lb.api | 35 | ||||
-rw-r--r-- | src/plugins/lb/lb.c | 23 | ||||
-rw-r--r-- | src/plugins/lb/lb.h | 18 | ||||
-rw-r--r-- | src/plugins/lb/lb_plugin_doc.md | 192 | ||||
-rw-r--r-- | src/plugins/lb/lb_plugin_doc.rst | 223 | ||||
-rw-r--r-- | src/plugins/lb/lb_test.c | 99 | ||||
-rw-r--r-- | src/plugins/lb/lb_types.api | 10 | ||||
-rw-r--r-- | src/plugins/lb/lbhash.h | 3 | ||||
-rw-r--r-- | src/plugins/lb/node.c | 102 |
11 files changed, 548 insertions, 282 deletions
diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c index e44f815cb9c..ea2e482135b 100644 --- a/src/plugins/lb/api.c +++ b/src/plugins/lb/api.c @@ -30,17 +30,15 @@ #include <lb/lb.api_enum.h> #include <lb/lb.api_types.h> -#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) #define REPLY_MSG_ID_BASE lbm->msg_id_base #include <vlibapi/api_helper_macros.h> -/* Macro to finish up custom dump fns */ -#define FINISH \ - vec_add1 (s, 0); \ - vl_print (handle, (char *)s); \ - vec_free (s); \ - return handle; +#define FINISH \ + vec_add1 (s, 0); \ + vlib_cli_output (handle, (char *) s); \ + vec_free (s); \ + return handle; static void vl_api_lb_conf_t_handler @@ -72,7 +70,7 @@ vl_api_lb_add_del_vip_t_handler lb_main_t *lbm = &lb_main; vl_api_lb_conf_reply_t * rmp; int rv = 0; - lb_vip_add_args_t args; + lb_vip_add_args_t args = {}; /* if port == 0, it means all-port VIP */ if (mp->port == 0) @@ -130,6 +128,80 @@ vl_api_lb_add_del_vip_t_handler } static void +vl_api_lb_add_del_vip_v2_t_handler (vl_api_lb_add_del_vip_v2_t *mp) +{ + lb_main_t *lbm = &lb_main; + vl_api_lb_conf_reply_t *rmp; + int rv = 0; + lb_vip_add_args_t args = {}; + + /* if port == 0, it means all-port VIP */ + if (mp->port == 0) + { + mp->protocol = ~0; + } + + ip_address_decode (&mp->pfx.address, &(args.prefix)); + + if (mp->is_del) + { + u32 vip_index; + if (!(rv = lb_vip_find_index (&(args.prefix), mp->pfx.len, mp->protocol, + ntohs (mp->port), &vip_index))) + rv = lb_vip_del (vip_index); + } + else + { + u32 vip_index; + lb_vip_type_t type = 0; + + if (ip46_prefix_is_ip4 (&(args.prefix), mp->pfx.len)) + { + if (mp->encap == LB_API_ENCAP_TYPE_GRE4) + type = LB_VIP_TYPE_IP4_GRE4; + else if (mp->encap == LB_API_ENCAP_TYPE_GRE6) + type = LB_VIP_TYPE_IP4_GRE6; + else if (mp->encap == LB_API_ENCAP_TYPE_L3DSR) + type = LB_VIP_TYPE_IP4_L3DSR; + else if (mp->encap == LB_API_ENCAP_TYPE_NAT4) + type = LB_VIP_TYPE_IP4_NAT4; + } + else + { + if (mp->encap == LB_API_ENCAP_TYPE_GRE4) + type = 
LB_VIP_TYPE_IP6_GRE4; + else if (mp->encap == LB_API_ENCAP_TYPE_GRE6) + type = LB_VIP_TYPE_IP6_GRE6; + else if (mp->encap == LB_API_ENCAP_TYPE_NAT6) + type = LB_VIP_TYPE_IP6_NAT6; + } + + args.plen = mp->pfx.len; + args.protocol = mp->protocol; + args.port = ntohs (mp->port); + args.type = type; + args.new_length = ntohl (mp->new_flows_table_length); + + if (mp->src_ip_sticky) + args.src_ip_sticky = 1; + + if (mp->encap == LB_API_ENCAP_TYPE_L3DSR) + { + args.encap_args.dscp = (u8) (mp->dscp & 0x3F); + } + else if ((mp->encap == LB_API_ENCAP_TYPE_NAT4) || + (mp->encap == LB_API_ENCAP_TYPE_NAT6)) + { + args.encap_args.srv_type = mp->type; + args.encap_args.target_port = ntohs (mp->target_port); + } + + rv = lb_vip_add (args, &vip_index); + } + REPLY_MACRO (VL_API_LB_ADD_DEL_VIP_V2_REPLY); +} + +static void vl_api_lb_add_del_as_t_handler (vl_api_lb_add_del_as_t * mp) { @@ -211,7 +283,6 @@ static void send_lb_as_details lb_main_t *lbm = &lb_main; int msg_size = 0; u32 *as_index; - u32 asindex = 0; /* construct as list under this vip */ lb_as_t *as; @@ -235,7 +306,6 @@ static void send_lb_as_details rmp->in_use_since = htonl(as->last_used); vl_api_send_msg (reg, (u8 *) rmp); - asindex++; } } @@ -260,7 +330,6 @@ vl_api_lb_as_dump_t_handler dump_all = (prefix.ip6.as_u64[0] == 0) && (prefix.ip6.as_u64[1] == 0); - /* *INDENT-OFF* */ pool_foreach (vip, lbm->vips) { if ( dump_all @@ -272,7 +341,6 @@ vl_api_lb_as_dump_t_handler send_lb_as_details(reg, mp->context, vip); } } - /* *INDENT-ON* */ } static void diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c index 7b5dc5c8549..afa73ef616c 100644 --- a/src/plugins/lb/cli.c +++ b/src/plugins/lb/cli.c @@ -32,6 +32,7 @@ lb_vip_command_fn (vlib_main_t * vm, clib_error_t *error = 0; args.new_length = 1024; + args.src_ip_sticky = 0; if (!unformat_user (input, unformat_line_input, line_input)) return 0; @@ -49,6 +50,8 @@ lb_vip_command_fn (vlib_main_t * vm, ; else if (unformat(line_input, "del")) del = 1; + else if (unformat 
(line_input, "src_ip_sticky")) + args.src_ip_sticky = 1; else if (unformat(line_input, "protocol tcp")) { args.protocol = (u8)IP_PROTOCOL_TCP; @@ -177,6 +180,7 @@ done: return error; } +/* clang-format off */ VLIB_CLI_COMMAND (lb_vip_command, static) = { .path = "lb vip", @@ -185,9 +189,10 @@ VLIB_CLI_COMMAND (lb_vip_command, static) = "[encap (gre6|gre4|l3dsr|nat4|nat6)] " "[dscp <n>] " "[type (nodeport|clusterip) target_port <n>] " - "[new_len <n>] [del]", + "[new_len <n>] [src_ip_sticky] [del]", .function = lb_vip_command_fn, }; +/* clang-format on */ static clib_error_t * lb_as_command_fn (vlib_main_t * vm, @@ -442,24 +447,22 @@ lb_set_interface_nat_command_fn (vlib_main_t * vm, { if (lb_nat4_interface_add_del (*sw_if_index, is_del)) { - error = clib_error_return( - 0, "%s %U failed", is_del ? "del" : "add", - format_vnet_sw_interface_name, vnm, - vnet_get_sw_interface (vnm, *sw_if_index)); - goto done; - } - } + error = clib_error_return ( + 0, "%s %U failed", is_del ? "del" : "add", + format_vnet_sw_if_index_name, vnm, *sw_if_index); + goto done; + } + } else { if (lb_nat6_interface_add_del (*sw_if_index, is_del)) { - error = clib_error_return( - 0, "%s %U failed", is_del ? "del" : "add", - format_vnet_sw_interface_name, vnm, - vnet_get_sw_interface (vnm, *sw_if_index)); - goto done; - } - } + error = clib_error_return ( + 0, "%s %U failed", is_del ? 
"del" : "add", + format_vnet_sw_if_index_name, vnm, *sw_if_index); + goto done; + } + } } done: diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api index 4bf30e76b59..96f047ddbc2 100644 --- a/src/plugins/lb/lb.api +++ b/src/plugins/lb/lb.api @@ -1,4 +1,4 @@ -option version = "1.0.0"; +option version = "1.1.0"; import "plugins/lb/lb_types.api"; import "vnet/interface_types.api"; @@ -54,6 +54,39 @@ autoreply define lb_add_del_vip { option vat_help = "<prefix> [protocol (tcp|udp) port <n>] [encap (gre6|gre4|l3dsr|nat4|nat6)] [dscp <n>] [type (nodeport|clusterip) target_port <n>] [new_len <n>] [del]"; }; +/** \brief Add a virtual address (or prefix) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param pfx - ip prefix and length + @param protocol - tcp or udp. + @param port - destination port. (0) means 'all-port VIP' + @param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2) or NAT4(3) or NAT6(4). + @param dscp - DSCP bit corresponding to VIP(applicable in L3DSR mode only). + @param type - service type(applicable in NAT4/NAT6 mode only). + @param target_port - Pod's port corresponding to specific service(applicable in NAT4/NAT6 mode only). + @param node_port - Node's port(applicable in NAT4/NAT6 mode only). + @param new_flows_table_length - Size of the new connections flow table used + for this VIP (must be power of 2). + @param src_ip_sticky - source ip based sticky session. + @param is_del - The VIP should be removed. 
+*/ +autoreply define lb_add_del_vip_v2 { + u32 client_index; + u32 context; + vl_api_address_with_prefix_t pfx; + u8 protocol [default=255]; + u16 port; + vl_api_lb_encap_type_t encap; + u8 dscp; + vl_api_lb_srv_type_t type ; /* LB_API_SRV_TYPE_CLUSTERIP */ + u16 target_port; + u16 node_port; + u32 new_flows_table_length [default=1024]; + bool src_ip_sticky; + bool is_del; + option vat_help = "<prefix> [protocol (tcp|udp) port <n>] [encap (gre6|gre4|l3dsr|nat4|nat6)] [dscp <n>] [type (nodeport|clusterip) target_port <n>] [new_len <n>] [src_ip_sticky] [del]"; +}; + /** \brief Add an application server for a given VIP @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index 6fc7f0f92b2..7ae1884ff31 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -198,15 +198,18 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) lb_vip_t *vip = va_arg (*args, lb_vip_t *); u32 indent = format_get_indent (s); - s = format(s, "%U %U [%lu] %U%s\n" + /* clang-format off */ + s = format(s, "%U %U [%lu] %U%s%s\n" "%U new_size:%u\n", format_white_space, indent, format_lb_vip_type, vip->type, vip - lbm->vips, format_ip46_prefix, &vip->prefix, (u32) vip->plen, IP46_TYPE_ANY, + lb_vip_is_src_ip_sticky (vip) ? 
" src_ip_sticky" : "", (vip->flags & LB_VIP_FLAGS_USED)?"":" removed", format_white_space, indent, vip->new_flow_table_mask + 1); + /* clang-format on */ if (vip->port != 0) { @@ -370,9 +373,9 @@ void lb_garbage_collection() } vec_foreach(i, to_be_removed_vips) { - vip = &lbm->vips[*i]; - pool_put(lbm->vips, vip); - pool_free(vip->as_indexes); + vip = &lbm->vips[*i]; + pool_free (vip->as_indexes); + pool_put (lbm->vips, vip); } vec_free(to_be_removed_vips); @@ -411,7 +414,7 @@ out: } //First, let's sort the ASs - vec_alloc(sort_arr, pool_elts(vip->as_indexes)); + vec_validate (sort_arr, pool_elts (vip->as_indexes) - 1); i = 0; pool_foreach (as_index, vip->as_indexes) { @@ -422,7 +425,7 @@ out: sort_arr[i].as_index = as - lbm->ass; i++; } - _vec_len(sort_arr) = i; + vec_set_len (sort_arr, i); vec_sort_with_function(sort_arr, lb_pseudorand_compare); @@ -1147,6 +1150,10 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) } vip->flags = LB_VIP_FLAGS_USED; + if (args.src_ip_sticky) + { + vip->flags |= LB_VIP_FLAGS_SRC_IP_STICKY; + } vip->as_indexes = 0; //Validate counters @@ -1249,12 +1256,10 @@ int lb_vip_del(u32 vip_index) return rv; } -/* *INDENT-OFF* */ VLIB_PLUGIN_REGISTER () = { .version = VPP_BUILD_VER, .description = "Load Balancer (LB)", }; -/* *INDENT-ON* */ u8 *format_lb_dpo (u8 * s, va_list * va) { @@ -1412,7 +1417,7 @@ lb_init (vlib_main_t * vm) lb_dpo_nat4_port_nodes); lbm->dpo_nat6_port_type = dpo_register_new_type(&lb_vft, lb_dpo_nat6_port_nodes); - lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); + lbm->fib_node_type = fib_node_register_new_type ("lb", &lb_fib_node_vft); //Init AS reference counters vlib_refcount_init(&lbm->as_refcount); diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h index ebbb1f6f8f0..46da40970c9 100644 --- a/src/plugins/lb/lb.h +++ b/src/plugins/lb/lb.h @@ -22,7 +22,7 @@ * The load-balancer receives traffic destined to VIP (Virtual IP) * addresses from one or multiple(ECMP) routers. 
* The load-balancer tunnels the traffic toward many application servers - * ensuring session stickyness (i.e. that a single sessions is tunneled + * ensuring session stickiness (i.e. that a single sessions is tunneled * towards a single application server). * */ @@ -324,6 +324,7 @@ typedef struct { */ u8 flags; #define LB_VIP_FLAGS_USED 0x1 +#define LB_VIP_FLAGS_SRC_IP_STICKY 0x2 /** * Pool of AS indexes used for this VIP. @@ -346,11 +347,14 @@ typedef struct { || (vip)->type == LB_VIP_TYPE_IP4_L3DSR \ || (vip)->type == LB_VIP_TYPE_IP4_NAT4 ) +#define lb_vip_is_src_ip_sticky(vip) \ + (((vip)->flags & LB_VIP_FLAGS_SRC_IP_STICKY) != 0) + +/* clang-format off */ #define lb_vip_is_gre4(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE4) \ && ((vip)->port == 0)) - #define lb_vip_is_gre6(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE6 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE6) \ && ((vip)->port == 0)) @@ -362,27 +366,28 @@ typedef struct { #define lb_vip_is_gre6_port(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE6 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE6) \ && ((vip)->port != 0)) +/* clang-format on */ always_inline bool lb_vip_is_l3dsr(const lb_vip_t *vip) { - return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port ==0); + return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port == 0); } always_inline bool lb_vip_is_l3dsr_port(const lb_vip_t *vip) { - return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port !=0); + return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port != 0); } always_inline bool lb_vip_is_nat4_port(const lb_vip_t *vip) { - return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port !=0); + return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port != 0); } always_inline bool lb_vip_is_nat6_port(const lb_vip_t *vip) { - return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port !=0); + return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port != 0); } format_function_t format_lb_vip; @@ -575,6 +580,7 @@ typedef struct { u8 plen; u8 protocol; u16 
port; + u8 src_ip_sticky; lb_vip_type_t type; u32 new_length; lb_vip_encap_args_t encap_args; diff --git a/src/plugins/lb/lb_plugin_doc.md b/src/plugins/lb/lb_plugin_doc.md deleted file mode 100644 index 5f6538974e9..00000000000 --- a/src/plugins/lb/lb_plugin_doc.md +++ /dev/null @@ -1,192 +0,0 @@ -# Load Balancer plugin for VPP {#lb_plugin_doc} - -## Version - -The load balancer plugin is currently in *beta* version. -Both CLIs and APIs are subject to *heavy* changes, -which also means feedback is really welcome regarding features, apis, etc... - -## Overview - -This plugin provides load balancing for VPP in a way that is largely inspired -from Google's MagLev: http://research.google.com/pubs/pub44824.html - -The load balancer is configured with a set of Virtual IPs (VIP, which can be -prefixes), and for each VIP, with a set of Application Server addresses (ASs). - -There are four encap types to steer traffic to different ASs: -1). IPv4+GRE ad IPv6+GRE encap types: -Traffic received for a given VIP (or VIP prefix) is tunneled using GRE towards -the different ASs in a way that (tries to) ensure that a given session will -always be tunneled to the same AS. - -2). IPv4+L3DSR encap types: -L3DSR is used to overcome Layer 2 limitations of Direct Server Return Load Balancing. -It maps VIP to DSCP bits, and reuse TOS bits to transfer DSCP bits -to server, and then server will get VIP from DSCP-to-VIP mapping. - -Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must be using -the same encap. type (i.e. IPv4+GRE or IPv6+GRE or IPv4+L3DSR). -Meaning that for a given VIP, all AS addresses must be of the same family. - -3). IPv4/IPv6 + NAT4/NAT6 encap types: -This type provides kube-proxy data plane on user space, -which is used to replace linux kernel's kube-proxy based on iptables. - -Currently, load balancer plugin supports three service types: -a) Cluster IP plus Port: support any protocols, including TCP, UDP. 
-b) Node IP plus Node Port: currently only support UDP. -c) External Load Balancer. - -For Cluster IP plus Port case: -kube-proxy is configured with a set of Virtual IPs (VIP, which can be -prefixes), and for each VIP, with a set of AS addresses (ASs). - -For a specific session received for a given VIP (or VIP prefix), -first packet selects a AS according to internal load balancing algorithm, -then does DNAT operation and sent to chosen AS. -At the same time, will create a session entry to store AS chosen result. -Following packets for that session will look up session table first, -which ensures that a given session will always be routed to the same AS. - -For returned packet from AS, it will do SNAT operation and sent out. - -Please refer to below for details: -https://schd.ws/hosted_files/ossna2017/1e/VPP_K8S_GTPU_OSSNA.pdf - - -## Performance - -The load balancer has been tested up to 1 millions flows and still forwards more -than 3Mpps per core in such circumstances. -Although 3Mpps seems already good, it is likely that performance will be improved -in next versions. - -## Configuration - -### Global LB parameters - -The load balancer needs to be configured with some parameters: - - lb conf [ip4-src-address <addr>] [ip6-src-address <addr>] - [buckets <n>] [timeout <s>] - -ip4-src-address: the source address used to send encap. packets using IPv4 for GRE4 mode. - or Node IP4 address for NAT4 mode. - -ip6-src-address: the source address used to send encap. packets using IPv6 for GRE6 mode. - or Node IP6 address for NAT6 mode. - -buckets: the *per-thread* established-connections-table number of buckets. - -timeout: the number of seconds a connection will remain in the - established-connections-table while no packet for this flow - is received. - -### Configure the VIPs - - lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] \ - [dscp <n>] [port <n> target_port <n> node_port <n>] [new_len <n>] [del] - -new_len is the size of the new-connection-table. 
It should be 1 or 2 orders of -magnitude bigger than the number of ASs for the VIP in order to ensure a good -load balancing. -Encap l3dsr and dscp is used to map VIP to dscp bit and rewrite DSCP bit in packets. -So the selected server could get VIP from DSCP bit in this packet and perform DSR. -Encap nat4/nat6 and port/target_port/node_port is used to do kube-proxy data plane. - -Examples: - - lb vip 2002::/16 encap gre6 new_len 1024 - lb vip 2003::/16 encap gre4 new_len 2048 - lb vip 80.0.0.0/8 encap gre6 new_len 16 - lb vip 90.0.0.0/8 encap gre4 new_len 1024 - lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32 - lb vip 90.1.2.1/32 encap nat4 port 3306 target_port 3307 node_port 30964 new_len 1024 - lb vip 2004::/16 encap nat6 port 6306 target_port 6307 node_port 30966 new_len 1024 - -### Configure the ASs (for each VIP) - - lb as <vip-prefix> [<address> [<address> [...]]] [del] - -You can add (or delete) as many ASs at a time (for a single VIP). -Note that the AS address family must correspond to the VIP encap. IP family. - -Examples: - - lb as 2002::/16 2001::2 2001::3 2001::4 - lb as 2003::/16 10.0.0.1 10.0.0.2 - lb as 80.0.0.0/8 2001::2 - lb as 90.0.0.0/8 10.0.0.1 - -### Configure SNAT - - lb set interface nat4 in <intfc> [del] - -Set SNAT feature in a specific interface. -(applicable in NAT4 mode only) - - lb set interface nat6 in <intfc> [del] - -Set SNAT feature in a specific interface. -(applicable in NAT6 mode only) - -## Monitoring - -The plugin provides quite a bunch of counters and information. -These are still subject to quite significant changes. - - show lb - show lb vip - show lb vip verbose - - show node counters - - -## Design notes - -### Multi-Threading - -MagLev is a distributed system which pseudo-randomly generates a -new-connections-table based on AS names such that each server configured with -the same set of ASs ends up with the same table. Connection stickyness is then -ensured with an established-connections-table. 
Using ECMP, it is assumed (but -not relied on) that servers will mostly receive traffic for different flows. - -This implementation pushes the parallelism a little bit further by using -one established-connections table per thread. This is equivalent to assuming -that RSS will make a job similar to ECMP, and is pretty useful as threads don't -need to get a lock in order to write in the table. - -### Hash Table - -A load balancer requires an efficient read and write hash table. The hash table -used by ip6-forward is very read-efficient, but not so much for writing. In -addition, it is not a big deal if writing into the hash table fails (again, -MagLev uses a flow table but does not heaviliy relies on it). - -The plugin therefore uses a very specific (and stupid) hash table. - - Fixed (and power of 2) number of buckets (configured at runtime) - - Fixed (and power of 2) elements per buckets (configured at compilation time) - -### Reference counting - -When an AS is removed, there is two possible ways to react. - - Keep using the AS for established connections - - Change AS for established connections (likely to cause error for TCP) - -In the first case, although an AS is removed from the configuration, its -associated state needs to stay around as long as it is used by at least one -thread. - -In order to avoid locks, a specific reference counter is used. The design is quite -similar to clib counters but: - - It is possible to decrease the value - - Summing will not zero the per-thread counters - - Only the thread can reallocate its own counters vector (to avoid concurrency issues) - -This reference counter is lock free, but reading a count of 0 does not mean -the value can be freed unless it is ensured by *other* means that no other thread -is concurrently referencing the object. In the case of this plugin, it is assumed -that no concurrent event will take place after a few seconds. 
- diff --git a/src/plugins/lb/lb_plugin_doc.rst b/src/plugins/lb/lb_plugin_doc.rst new file mode 100644 index 00000000000..603453e7848 --- /dev/null +++ b/src/plugins/lb/lb_plugin_doc.rst @@ -0,0 +1,223 @@ +Load Balancer plugin +==================== + +Version +------- + +The load balancer plugin is currently in *beta* version. Both CLIs and +APIs are subject to *heavy* changes, which also means feedback is really +welcome regarding features, apis, etc… + +Overview +-------- + +This plugin provides load balancing for VPP in a way that is largely +inspired from Google’s MagLev: +http://research.google.com/pubs/pub44824.html + +The load balancer is configured with a set of Virtual IPs (VIP, which +can be prefixes), and for each VIP, with a set of Application Server +addresses (ASs). + +There are four encap types to steer traffic to different ASs: 1). +IPv4+GRE ad IPv6+GRE encap types: Traffic received for a given VIP (or +VIP prefix) is tunneled using GRE towards the different ASs in a way +that (tries to) ensure that a given session will always be tunneled to +the same AS. + +2). IPv4+L3DSR encap types: L3DSR is used to overcome Layer 2 +limitations of Direct Server Return Load Balancing. It maps VIP to DSCP +bits, and reuse TOS bits to transfer DSCP bits to server, and then +server will get VIP from DSCP-to-VIP mapping. + +Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must +be using the same encap. type (i.e. IPv4+GRE or IPv6+GRE or IPv4+L3DSR). +Meaning that for a given VIP, all AS addresses must be of the same +family. + +3). IPv4/IPv6 + NAT4/NAT6 encap types: This type provides kube-proxy +data plane on user space, which is used to replace linux kernel’s +kube-proxy based on iptables. + +Currently, load balancer plugin supports three service types: a) Cluster +IP plus Port: support any protocols, including TCP, UDP. b) Node IP plus +Node Port: currently only support UDP. c) External Load Balancer. 
+ +For the Cluster IP plus Port case: kube-proxy is configured with a set of +Virtual IPs (VIP, which can be prefixes), and for each VIP, with a set +of AS addresses (ASs). + +For a specific session received for a given VIP (or VIP prefix), the first +packet selects an AS according to the internal load balancing algorithm, then +undergoes a DNAT operation and is sent to the chosen AS. At the same time, a +session entry is created to store the chosen AS. Following packets for that +session will look up the session table first, which ensures that a given +session will always be routed to the same AS. + +Packets returned from the AS undergo an SNAT operation and are then sent out. + +Please refer to below for details: +https://schd.ws/hosted_files/ossna2017/1e/VPP_K8S_GTPU_OSSNA.pdf + +Performance +----------- + +The load balancer has been tested up to 1 million flows and still +forwards more than 3Mpps per core in such circumstances. Although 3Mpps +seems already good, it is likely that performance will be improved in +next versions. + +Configuration +------------- + +Global LB parameters +~~~~~~~~~~~~~~~~~~~~ + +The load balancer needs to be configured with some parameters: + +:: + + lb conf [ip4-src-address <addr>] [ip6-src-address <addr>] + [buckets <n>] [timeout <s>] + +ip4-src-address: the source address used to send encap. packets using +IPv4 for GRE4 mode, or the Node IP4 address for NAT4 mode. + +ip6-src-address: the source address used to send encap. packets using +IPv6 for GRE6 mode, or the Node IP6 address for NAT6 mode. + +buckets: the *per-thread* established-connections-table number of +buckets. + +timeout: the number of seconds a connection will remain in the +established-connections-table while no packet for this flow is received. + +Configure the VIPs +~~~~~~~~~~~~~~~~~~ + +:: + + lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] \ + [dscp <n>] [port <n> target_port <n> node_port <n>] [new_len <n>] [del] + +new_len is the size of the new-connection-table.
It should be 1 or 2 +orders of magnitude bigger than the number of ASs for the VIP in order +to ensure a good load balancing. Encap l3dsr and dscp is used to map VIP +to dscp bit and rewrite DSCP bit in packets. So the selected server +could get VIP from DSCP bit in this packet and perform DSR. Encap +nat4/nat6 and port/target_port/node_port is used to do kube-proxy data +plane. + +Examples: + +:: + + lb vip 2002::/16 encap gre6 new_len 1024 + lb vip 2003::/16 encap gre4 new_len 2048 + lb vip 80.0.0.0/8 encap gre6 new_len 16 + lb vip 90.0.0.0/8 encap gre4 new_len 1024 + lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32 + lb vip 90.1.2.1/32 encap nat4 port 3306 target_port 3307 node_port 30964 new_len 1024 + lb vip 2004::/16 encap nat6 port 6306 target_port 6307 node_port 30966 new_len 1024 + +Configure the ASs (for each VIP) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + lb as <vip-prefix> [<address> [<address> [...]]] [del] + +You can add (or delete) as many ASs at a time (for a single VIP). Note +that the AS address family must correspond to the VIP encap. IP family. + +Examples: + +:: + + lb as 2002::/16 2001::2 2001::3 2001::4 + lb as 2003::/16 10.0.0.1 10.0.0.2 + lb as 80.0.0.0/8 2001::2 + lb as 90.0.0.0/8 10.0.0.1 + +Configure SNAT +~~~~~~~~~~~~~~ + +:: + + lb set interface nat4 in <intfc> [del] + +Set SNAT feature in a specific interface. (applicable in NAT4 mode only) + +:: + + lb set interface nat6 in <intfc> [del] + +Set SNAT feature in a specific interface. (applicable in NAT6 mode only) + +Monitoring +---------- + +The plugin provides quite a bunch of counters and information. These are +still subject to quite significant changes. 
+ +:: + + show lb + show lb vip + show lb vip verbose + + show node counters + +Design notes +------------ + +Multi-Threading +~~~~~~~~~~~~~~~ + +MagLev is a distributed system which pseudo-randomly generates a +new-connections-table based on AS names such that each server configured +with the same set of ASs ends up with the same table. Connection +stickiness is then ensured with an established-connections-table. Using +ECMP, it is assumed (but not relied on) that servers will mostly receive +traffic for different flows. + +This implementation pushes the parallelism a little bit further by using +one established-connections table per thread. This is equivalent to +assuming that RSS will do a job similar to ECMP, and is pretty useful +as threads don’t need to get a lock in order to write in the table. + +Hash Table +~~~~~~~~~~ + +A load balancer requires an efficient read and write hash table. The +hash table used by ip6-forward is very read-efficient, but not so much +for writing. In addition, it is not a big deal if writing into the hash +table fails (again, MagLev uses a flow table but does not heavily +rely on it). + +The plugin therefore uses a very specific (and stupid) hash table: + +- Fixed (and power of 2) number of buckets (configured at runtime) +- Fixed (and power of 2) elements per bucket (configured at compilation time) + +Reference counting +~~~~~~~~~~~~~~~~~~ + +When an AS is removed, there are two possible ways to react: + +- Keep using the AS for established connections +- Change AS for established connections (likely to cause errors for TCP) + +In the first case, although an AS is removed from the configuration, its +associated state needs to stay around as long as it is used by at least +one thread. + +In order to avoid locks, a specific reference counter is used.
The +design is quite similar to clib counters but: - It is possible to +decrease the value - Summing will not zero the per-thread counters - +Only the thread can reallocate its own counters vector (to avoid +concurrency issues) + +This reference counter is lock free, but reading a count of 0 does not +mean the value can be freed unless it is ensured by *other* means that +no other thread is concurrently referencing the object. In the case of +this plugin, it is assumed that no concurrent event will take place +after a few seconds. diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c index 80fc38e2746..f64bdd220b5 100644 --- a/src/plugins/lb/lb_test.c +++ b/src/plugins/lb/lb_test.c @@ -207,6 +207,105 @@ static int api_lb_add_del_vip (vat_main_t * vam) return ret; } +static int +api_lb_add_del_vip_v2 (vat_main_t *vam) +{ + unformat_input_t *line_input = vam->input; + vl_api_lb_add_del_vip_v2_t *mp; + int ret; + ip46_address_t ip_prefix; + u8 prefix_length = 0; + u8 protocol = 0; + u32 port = 0; + u32 encap = 0; + u32 dscp = ~0; + u32 srv_type = LB_SRV_TYPE_CLUSTERIP; + u32 target_port = 0; + u32 new_length = 1024; + u8 src_ip_sticky = 0; + int is_del = 0; + + if (!unformat (line_input, "%U", unformat_ip46_prefix, &ip_prefix, + &prefix_length, IP46_TYPE_ANY, &prefix_length)) + { + errmsg ("lb_add_del_vip: invalid vip prefix\n"); + return -99; + } + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "new_len %d", &new_length)) + ; + else if (unformat (line_input, "del")) + is_del = 1; + else if (unformat (line_input, "src_ip_sticky")) + src_ip_sticky = 1; + else if (unformat (line_input, "protocol tcp")) + { + protocol = IP_PROTOCOL_TCP; + } + else if (unformat (line_input, "protocol udp")) + { + protocol = IP_PROTOCOL_UDP; + } + else if (unformat (line_input, "port %d", &port)) + ; + else if (unformat (line_input, "encap gre4")) + encap = LB_ENCAP_TYPE_GRE4; + else if (unformat (line_input, "encap gre6")) + 
encap = LB_ENCAP_TYPE_GRE6; + else if (unformat (line_input, "encap l3dsr")) + encap = LB_ENCAP_TYPE_L3DSR; + else if (unformat (line_input, "encap nat4")) + encap = LB_ENCAP_TYPE_NAT4; + else if (unformat (line_input, "encap nat6")) + encap = LB_ENCAP_TYPE_NAT6; + else if (unformat (line_input, "dscp %d", &dscp)) + ; + else if (unformat (line_input, "type clusterip")) + srv_type = LB_SRV_TYPE_CLUSTERIP; + else if (unformat (line_input, "type nodeport")) + srv_type = LB_SRV_TYPE_NODEPORT; + else if (unformat (line_input, "target_port %d", &target_port)) + ; + else + { + errmsg ("invalid arguments\n"); + return -99; + } + } + + if ((encap != LB_ENCAP_TYPE_L3DSR) && (dscp != ~0)) + { + errmsg ("lb_vip_add error: should not configure dscp for none L3DSR."); + return -99; + } + + if ((encap == LB_ENCAP_TYPE_L3DSR) && (dscp >= 64)) + { + errmsg ("lb_vip_add error: dscp for L3DSR should be less than 64."); + return -99; + } + + M (LB_ADD_DEL_VIP, mp); + ip_address_encode (&ip_prefix, IP46_TYPE_ANY, &mp->pfx.address); + mp->pfx.len = prefix_length; + mp->protocol = (u8) protocol; + mp->port = htons ((u16) port); + mp->encap = (u8) encap; + mp->dscp = (u8) dscp; + mp->type = (u8) srv_type; + mp->target_port = htons ((u16) target_port); + mp->node_port = htons ((u16) target_port); + mp->new_flows_table_length = htonl (new_length); + mp->is_del = is_del; + mp->src_ip_sticky = src_ip_sticky; + + S (mp); + W (ret); + return ret; +} + static int api_lb_add_del_as (vat_main_t * vam) { diff --git a/src/plugins/lb/lb_types.api b/src/plugins/lb/lb_types.api index 3378a5fec4f..a6e1980b6be 100644 --- a/src/plugins/lb/lb_types.api +++ b/src/plugins/lb/lb_types.api @@ -28,9 +28,9 @@ enum lb_encap_type LB_API_ENCAP_TYPE_GRE4 = 0, LB_API_ENCAP_TYPE_GRE6 = 1, LB_API_ENCAP_TYPE_L3DSR = 2, - LB_API_ENCAP_TYPE_NAT4 = 3 , - LB_API_ENCAP_TYPE_NAT6 =4, - LB_API_ENCAP_N_TYPES =5, + LB_API_ENCAP_TYPE_NAT4 = 3, + LB_API_ENCAP_TYPE_NAT6 = 4, + LB_API_ENCAP_N_TYPES = 5, }; /* Lookup types */ @@ 
-38,8 +38,8 @@ enum lb_lkp_type_t
 {
   LB_API_LKP_SAME_IP_PORT = 0,
   LB_API_LKP_DIFF_IP_PORT = 1,
-  LB_API_LKP_ALL_PORT_IP =2,
-  LB_API_LKP_N_TYPES =3,
+  LB_API_LKP_ALL_PORT_IP = 2,
+  LB_API_LKP_N_TYPES = 3,
 };
 
 enum lb_vip_type
diff --git a/src/plugins/lb/lbhash.h b/src/plugins/lb/lbhash.h
index f822d79ded8..8253e9d52f0 100644
--- a/src/plugins/lb/lbhash.h
+++ b/src/plugins/lb/lbhash.h
@@ -88,8 +88,7 @@ lb_hash_t *lb_hash_alloc(u32 buckets, u32 timeout)
       sizeof(lb_hash_bucket_t) * (buckets + 1);
   u8 *mem = 0;
   lb_hash_t *h;
-  vec_alloc_aligned(mem, size, CLIB_CACHE_LINE_BYTES);
-  clib_memset(mem, 0, size);
+  vec_validate_aligned (mem, size - 1, CLIB_CACHE_LINE_BYTES); /* takes the last index, hence size - 1; replaces alloc + memset (presumably zero-fills - confirm) */
   h = (lb_hash_t *)mem;
   h->buckets_mask = (buckets - 1);
   h->timeout = timeout;
diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c
index b5e9da71376..a37fe11a9b4 100644
--- a/src/plugins/lb/node.c
+++ b/src/plugins/lb/node.c
@@ -174,26 +174,22 @@ lb_node_get_other_ports6 (ip6_header_t *ip60)
 }
 
 static_always_inline void
-lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4,
-                  u32 *hash, u32 *vip_idx, u8 per_port_vip)
+lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4, u32 *hash,
+                  u32 *vip_idx, u8 per_port_vip)
 {
   vip_port_key_t key;
   clib_bihash_kv_8_8_t kv, value;
+  ip4_header_t *ip40; /* assigned only when is_input_v4 is set */
+  ip6_header_t *ip60; /* assigned only when is_input_v4 is clear */
+  lb_vip_t *vip0; /* VIP at *vip_idx, read for its src_ip_sticky setting */
+  u64 ports; /* assigned in both address-family branches below */
 
   /* For vip case, retrieve vip index for ip lookup */
   *vip_idx = vnet_buffer (p)->ip.adj_index[VLIB_TX];
 
-  if (per_port_vip)
-    {
-      /* For per-port-vip case, ip lookup stores placeholder index */
-      key.vip_prefix_index = *vip_idx;
-    }
-
+  /* Extract the L4 port number from the packet; it feeds both the per-port VIP key and the flow hash computed further down */
   if (is_input_v4)
     {
-      ip4_header_t *ip40;
-      u64 ports;
-
       ip40 = vlib_buffer_get_current (p);
       if (PREDICT_TRUE(
           ip40->protocol == IP_PROTOCOL_TCP
@@ -202,20 +198,10 @@ lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4,
           | ((u64) ((udp_header_t *) (ip40 + 1))->dst_port);
       else
         ports = lb_node_get_other_ports4 (ip40);
-
-      *hash = lb_hash_hash (*((u64 *) &ip40->address_pair), ports, 0, 0, 0);
-
-      if (per_port_vip)
-        {
-          key.protocol = ip40->protocol;
-          key.port = (u16)(ports & 0xFFFF);
-        }
     }
   else
     {
-      ip6_header_t *ip60;
       ip60 = vlib_buffer_get_current (p);
-      u64 ports;
 
       if (PREDICT_TRUE(
           ip60->protocol == IP_PROTOCOL_TCP
@@ -224,33 +210,68 @@ lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4,
           | ((u64) ((udp_header_t *) (ip60 + 1))->dst_port);
       else
         ports = lb_node_get_other_ports6 (ip60);
-
-      *hash = lb_hash_hash (ip60->src_address.as_u64[0],
-                            ip60->src_address.as_u64[1],
-                            ip60->dst_address.as_u64[0],
-                            ip60->dst_address.as_u64[1], ports);
-
-      if (per_port_vip)
-        {
-          key.protocol = ip60->protocol;
-          key.port = (u16)(ports & 0xFFFF);
-        }
     }
 
-  /* For per-port-vip case, retrieve vip index for vip_port_filter table */
   if (per_port_vip)
     {
+      /* For per-port-vip case, ip lookup stores placeholder index */
+      key.vip_prefix_index = *vip_idx;
+      key.port = (u16) (ports & 0xFFFF); /* low 16 bits of ports */
+      key.rsv = 0; /* every key field is set before key.as_u64 is read below */
+      if (is_input_v4)
+        {
+          key.protocol = ip40->protocol;
+        }
+      else
+        {
+          key.protocol = ip60->protocol;
+        }
+
+      /* For per-port-vip case, retrieve vip index for vip_port_filter table */
       kv.key = key.as_u64;
-      if (clib_bihash_search_8_8(&lbm->vip_index_per_port, &kv, &value) < 0)
-        {
-          /* return default vip */
-          *vip_idx = 0;
-          return;
-        }
-      *vip_idx = value.value;
+      if (clib_bihash_search_8_8 (&lbm->vip_index_per_port, &kv, &value) < 0)
+        {
+          /* Lookup returned < 0: set default vip (index 0). There is no early return here, so *hash is still computed below */
+          *vip_idx = 0;
+        }
+      else
+        {
+          *vip_idx = value.value;
+        }
+    }
+
+  vip0 = pool_elt_at_index (lbm->vips, *vip_idx); /* uses the final index, after any per-port lookup */
+
+  if (is_input_v4)
+    {
+      if (lb_vip_is_src_ip_sticky (vip0)) /* sticky VIP: hash the address pair only, 0 in place of ports */
+        {
+          *hash = lb_hash_hash (*((u64 *) &ip40->address_pair), 0, 0, 0, 0);
+        }
+      else
+        {
+          *hash =
+            lb_hash_hash (*((u64 *) &ip40->address_pair), ports, 0, 0, 0);
+        }
+    }
+  else
+    {
+      if (lb_vip_is_src_ip_sticky (vip0)) /* sticky VIP: hash the addresses only, 0 in place of ports */
+        {
+          *hash = lb_hash_hash (
+            ip60->src_address.as_u64[0], ip60->src_address.as_u64[1],
+            ip60->dst_address.as_u64[0], ip60->dst_address.as_u64[1], 0);
+        }
+      else
+        {
+          *hash = lb_hash_hash (
+            ip60->src_address.as_u64[0], ip60->src_address.as_u64[1],
+            ip60->dst_address.as_u64[0], ip60->dst_address.as_u64[1], ports);
+        }
     }
 }
 
+/* clang-format off */
 static_always_inline uword
 lb_node_fn (vlib_main_t * vm,
          vlib_node_runtime_t * node,
@@ -565,6 +586,7 @@ lb_node_fn (vlib_main_t * vm,
 
   return frame->n_vectors;
 }
+/* clang-format on */
 
 u8 *
 format_nodeport_lb_trace (u8 * s, va_list * args)
|