From d92a0b553fd2872b4fcda25994aaa8852d254824 Mon Sep 17 00:00:00 2001 From: Hongjun Ni Date: Tue, 6 Feb 2018 23:00:22 +0800 Subject: Rework kube-proxy into LB plugin Add support of NAT66 Change-Id: Ie6aa79078a3835f989829b9a597c448dfd2f9ea3 Signed-off-by: Hongjun Ni --- src/plugins/lb/api.c | 80 ++- src/plugins/lb/cli.c | 189 +++++- src/plugins/lb/lb.api | 16 +- src/plugins/lb/lb.c | 360 +++++++++-- src/plugins/lb/lb.h | 211 ++++++- src/plugins/lb/lb_plugin_doc.md | 64 +- src/plugins/lb/lb_test.c | 8 +- src/plugins/lb/node.c | 1269 +++++++++++++++++++++++++++++---------- 8 files changed, 1772 insertions(+), 425 deletions(-) (limited to 'src/plugins/lb') diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c index 28af6daa421..beec4ae5876 100644 --- a/src/plugins/lb/api.c +++ b/src/plugins/lb/api.c @@ -107,33 +107,52 @@ vl_api_lb_add_del_vip_t_handler lb_main_t *lbm = &lb_main; vl_api_lb_conf_reply_t * rmp; int rv = 0; - ip46_address_t prefix; - memcpy(&prefix.ip6, mp->ip_prefix, sizeof(prefix.ip6)); + lb_vip_add_args_t args; + + memcpy (&(args.prefix.ip6), mp->ip_prefix, sizeof(args.prefix.ip6)); if (mp->is_del) { u32 vip_index; - if (!(rv = lb_vip_find_index(&prefix, mp->prefix_length, &vip_index))) + if (!(rv = lb_vip_find_index(&(args.prefix), mp->prefix_length, &vip_index))) rv = lb_vip_del(vip_index); } else { u32 vip_index; lb_vip_type_t type = 0; - if (ip46_prefix_is_ip4(&prefix, mp->prefix_length)) { + if (ip46_prefix_is_ip4(&(args.prefix), mp->prefix_length)) { if (mp->encap == LB_ENCAP_TYPE_GRE4) - type = LB_VIP_TYPE_IP4_GRE4; + type = LB_VIP_TYPE_IP4_GRE4; else if (mp->encap == LB_ENCAP_TYPE_GRE6) - type = LB_VIP_TYPE_IP4_GRE6; + type = LB_VIP_TYPE_IP4_GRE6; else if (mp->encap == LB_ENCAP_TYPE_L3DSR) - type = LB_VIP_TYPE_IP4_L3DSR; + type = LB_VIP_TYPE_IP4_L3DSR; + else if (mp->encap == LB_ENCAP_TYPE_NAT4) + type = LB_VIP_TYPE_IP4_NAT4; } else { if (mp->encap == LB_ENCAP_TYPE_GRE4) - type = LB_VIP_TYPE_IP6_GRE4; + type = LB_VIP_TYPE_IP6_GRE4; else if 
(mp->encap == LB_ENCAP_TYPE_GRE6) - type = LB_VIP_TYPE_IP6_GRE6; + type = LB_VIP_TYPE_IP6_GRE6; + else if (mp->encap == LB_ENCAP_TYPE_NAT6) + type = LB_VIP_TYPE_IP6_NAT6; } - rv = lb_vip_add(&prefix, mp->prefix_length, type, mp->dscp, - mp->new_flows_table_length, &vip_index); + args.plen = mp->prefix_length; + args.type = type; + args.new_length = mp->new_flows_table_length; + + if (mp->encap == LB_ENCAP_TYPE_L3DSR) { + args.encap_args.dscp = (u8)(mp->dscp & 0x3F); + } + else if ((mp->encap == LB_ENCAP_TYPE_NAT4) + ||(mp->encap == LB_ENCAP_TYPE_NAT6)) { + args.encap_args.srv_type = mp->type; + args.encap_args.port = ntohs(mp->port); + args.encap_args.target_port = ntohs(mp->target_port); + args.encap_args.node_port = ntohs(mp->node_port); + } + + rv = lb_vip_add(args, &vip_index); } REPLY_MACRO (VL_API_LB_CONF_REPLY); } @@ -146,8 +165,26 @@ static void *vl_api_lb_add_del_vip_t_print s = format (s, "%U ", format_ip46_prefix, (ip46_address_t *)mp->ip_prefix, mp->prefix_length, IP46_TYPE_ANY); - s = format (s, "%s ", (mp->encap==LB_ENCAP_TYPE_GRE4)? - "gre4":(mp->encap==LB_ENCAP_TYPE_GRE6)?"gre6":"l3dsr"); + s = format (s, "%s ", (mp->encap == LB_ENCAP_TYPE_GRE4)? "gre4" + : (mp->encap == LB_ENCAP_TYPE_GRE6)? "gre6" + : (mp->encap == LB_ENCAP_TYPE_NAT4)? "nat4" + : (mp->encap == LB_ENCAP_TYPE_NAT6)? 
"nat6" + : "l3dsr"); + + if (mp->encap==LB_ENCAP_TYPE_L3DSR) + { + s = format (s, "dscp %u ", mp->dscp); + } + + if ((mp->encap==LB_ENCAP_TYPE_NAT4) + || (mp->encap==LB_ENCAP_TYPE_NAT6)) + { + s = format (s, "type %u ", mp->type); + s = format (s, "port %u ", mp->port); + s = format (s, "target_port %u ", mp->target_port); + s = format (s, "node_port %u ", mp->node_port); + } + s = format (s, "%u ", mp->new_flows_table_length); s = format (s, "%s ", mp->is_del?"del":"add"); FINISH; @@ -161,14 +198,23 @@ vl_api_lb_add_del_as_t_handler vl_api_lb_conf_reply_t * rmp; int rv = 0; u32 vip_index; - if ((rv = lb_vip_find_index((ip46_address_t *)mp->vip_ip_prefix, - mp->vip_prefix_length, &vip_index))) + ip46_address_t vip_ip_prefix; + + memcpy(&vip_ip_prefix.ip6, mp->vip_ip_prefix, + sizeof(vip_ip_prefix.ip6)); + + ip46_address_t as_address; + + memcpy(&as_address.ip6, mp->as_address, + sizeof(as_address.ip6)); + + if ((rv = lb_vip_find_index(&vip_ip_prefix, mp->vip_prefix_length, &vip_index))) goto done; if (mp->is_del) - rv = lb_vip_del_ass(vip_index, (ip46_address_t *)mp->as_address, 1); + rv = lb_vip_del_ass(vip_index, &as_address, 1); else - rv = lb_vip_add_ass(vip_index, (ip46_address_t *)mp->as_address, 1); + rv = lb_vip_add_ass(vip_index, &as_address, 1); done: REPLY_MACRO (VL_API_LB_CONF_REPLY); diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c index b29605af984..2bd06b98b6a 100644 --- a/src/plugins/lb/cli.c +++ b/src/plugins/lb/cli.c @@ -21,20 +21,24 @@ lb_vip_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { unformat_input_t _line_input, *line_input = &_line_input; - ip46_address_t prefix; - u8 plen; - u32 new_len = 1024; + lb_vip_add_args_t args; u8 del = 0; int ret; u32 encap = 0; u32 dscp = ~0; - lb_vip_type_t type = 0; + u32 srv_type = LB_SRV_TYPE_CLUSTERIP; + u32 port = 0; + u32 target_port = 0; + u32 node_port = 0; clib_error_t *error = 0; + args.new_length = 1024; + if (!unformat_user (input, unformat_line_input, 
line_input)) return 0; - if (!unformat(line_input, "%U", unformat_ip46_prefix, &prefix, &plen, IP46_TYPE_ANY)) { + if (!unformat(line_input, "%U", unformat_ip46_prefix, &(args.prefix), + &(args.plen), IP46_TYPE_ANY, &(args.plen))) { error = clib_error_return (0, "invalid vip prefix: '%U'", format_unformat_error, line_input); goto done; @@ -42,7 +46,7 @@ lb_vip_command_fn (vlib_main_t * vm, while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { - if (unformat(line_input, "new_len %d", &new_len)) + if (unformat(line_input, "new_len %d", &(args.new_length))) ; else if (unformat(line_input, "del")) del = 1; @@ -52,8 +56,22 @@ lb_vip_command_fn (vlib_main_t * vm, encap = LB_ENCAP_TYPE_GRE6; else if (unformat(line_input, "encap l3dsr")) encap = LB_ENCAP_TYPE_L3DSR; + else if (unformat(line_input, "encap nat4")) + encap = LB_ENCAP_TYPE_NAT4; + else if (unformat(line_input, "encap nat6")) + encap = LB_ENCAP_TYPE_NAT6; else if (unformat(line_input, "dscp %d", &dscp)) ; + else if (unformat(line_input, "type clusterip")) + srv_type = LB_SRV_TYPE_CLUSTERIP; + else if (unformat(line_input, "type nodeport")) + srv_type = LB_SRV_TYPE_NODEPORT; + else if (unformat(line_input, "port %d", &port)) + ; + else if (unformat(line_input, "target_port %d", &target_port)) + ; + else if (unformat(line_input, "node_port %d", &node_port)) + ; else { error = clib_error_return (0, "parse error: '%U'", format_unformat_error, line_input); @@ -61,46 +79,75 @@ lb_vip_command_fn (vlib_main_t * vm, } } - if ((encap != LB_ENCAP_TYPE_L3DSR) && (dscp != ~0) ) + if ((encap != LB_ENCAP_TYPE_L3DSR) && (dscp != ~0)) { - error = clib_error_return (0, "lb_vip_add error: " - "should not configure dscp for none L3DSR."); + error = clib_error_return(0, "lb_vip_add error: " + "should not configure dscp for none L3DSR."); goto done; } - if ((encap == LB_ENCAP_TYPE_L3DSR) && (dscp >= 64 ) ) + if ((encap == LB_ENCAP_TYPE_L3DSR) && (dscp >= 64)) { - error = clib_error_return (0, "lb_vip_add error: " - 
"dscp for L3DSR should be less than 64."); + error = clib_error_return(0, "lb_vip_add error: " + "dscp for L3DSR should be less than 64."); goto done; } - if (ip46_prefix_is_ip4(&prefix, plen)) { + if (ip46_prefix_is_ip4(&(args.prefix), (args.plen))) + { if (encap == LB_ENCAP_TYPE_GRE4) - type = LB_VIP_TYPE_IP4_GRE4; + args.type = LB_VIP_TYPE_IP4_GRE4; else if (encap == LB_ENCAP_TYPE_GRE6) - type = LB_VIP_TYPE_IP4_GRE6; + args.type = LB_VIP_TYPE_IP4_GRE6; else if (encap == LB_ENCAP_TYPE_L3DSR) - type = LB_VIP_TYPE_IP4_L3DSR; - } else { + args.type = LB_VIP_TYPE_IP4_L3DSR; + else if (encap == LB_ENCAP_TYPE_NAT4) + args.type = LB_VIP_TYPE_IP4_NAT4; + else if (encap == LB_ENCAP_TYPE_NAT6) + { + error = clib_error_return(0, "currently does not support NAT46"); + goto done; + } + } + else + { if (encap == LB_ENCAP_TYPE_GRE4) - type = LB_VIP_TYPE_IP6_GRE4; + args.type = LB_VIP_TYPE_IP6_GRE4; else if (encap == LB_ENCAP_TYPE_GRE6) - type = LB_VIP_TYPE_IP6_GRE6; - } + args.type = LB_VIP_TYPE_IP6_GRE6; + else if (encap == LB_ENCAP_TYPE_NAT6) + args.type = LB_VIP_TYPE_IP6_NAT6; + else if (encap == LB_ENCAP_TYPE_NAT4) + { + error = clib_error_return(0, "currently does not support NAT64"); + goto done; + } + } lb_garbage_collection(); u32 index; if (!del) { - if ((ret = lb_vip_add(&prefix, plen, type, (u8)(dscp & 0x3F), new_len, &index))) { + if (encap == LB_ENCAP_TYPE_L3DSR) { + args.encap_args.dscp = (u8)(dscp & 0x3F); + } + else if ((encap == LB_ENCAP_TYPE_NAT4) + || (encap == LB_ENCAP_TYPE_NAT6)) + { + args.encap_args.srv_type = (u8) srv_type; + args.encap_args.port = (u16) port; + args.encap_args.target_port = (u16) target_port; + args.encap_args.node_port = (u16) node_port; + } + + if ((ret = lb_vip_add(args, &index))) { error = clib_error_return (0, "lb_vip_add error %d", ret); goto done; } else { vlib_cli_output(vm, "lb_vip_add ok %d", index); } } else { - if ((ret = lb_vip_find_index(&prefix, plen, &index))) { + if ((ret = lb_vip_find_index(&(args.prefix), args.plen, 
&index))) { error = clib_error_return (0, "lb_vip_find_index error %d", ret); goto done; } else if ((ret = lb_vip_del(index))) { @@ -118,7 +165,10 @@ done: VLIB_CLI_COMMAND (lb_vip_command, static) = { .path = "lb vip", - .short_help = "lb vip [encap (gre6|gre4|l3dsr)] [dscp ] [new_len ] [del]", + .short_help = "lb vip [encap (gre6|gre4|l3dsr|nat4|nat6)] " + "[dscp ] " + "[type (nodeport|clusterip) port target_port node_port ] " + "[new_len ] [del]", .function = lb_vip_command_fn, }; @@ -300,6 +350,99 @@ VLIB_CLI_COMMAND (lb_show_vips_command, static) = .function = lb_show_vips_command_fn, }; +static clib_error_t * +lb_set_interface_nat_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd, + u8 is_nat6) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 * sw_if_index = 0; + u32 * inside_sw_if_indices = 0; + int is_del = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "in %U", unformat_vnet_sw_interface, + vnm, sw_if_index)) + vec_add1 (inside_sw_if_indices, *sw_if_index); + else if (unformat (line_input, "del")) + is_del = 1; + else + { + error = clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + goto done; + } + } + + vec_foreach (sw_if_index, inside_sw_if_indices) + { + if (!is_nat6) + { + if (lb_nat4_interface_add_del (*sw_if_index, is_del)) + { + error = clib_error_return( + 0, "%s %U failed", is_del ? "del" : "add", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, *sw_if_index)); + goto done; + } + } + else + { + if (lb_nat6_interface_add_del (*sw_if_index, is_del)) + { + error = clib_error_return( + 0, "%s %U failed", is_del ? 
"del" : "add", + format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, *sw_if_index)); + goto done; + } + } + } + +done: + unformat_free (line_input); + vec_free (inside_sw_if_indices); + + return error; +} + +static clib_error_t * +lb_set_interface_nat4_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + return lb_set_interface_nat_command_fn(vm, input, cmd, 0); +} + +VLIB_CLI_COMMAND (lb_set_interface_nat4_command, static) = { + .path = "lb set interface nat4", + .function = lb_set_interface_nat4_command_fn, + .short_help = "lb set interface nat4 in [del]", +}; + +static clib_error_t * +lb_set_interface_nat6_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + return lb_set_interface_nat_command_fn(vm, input, cmd, 1); +} + +VLIB_CLI_COMMAND (lb_set_interface_nat6_command, static) = { + .path = "lb set interface nat6", + .function = lb_set_interface_nat6_command_fn, + .short_help = "lb set interface nat6 in [del]", +}; + static clib_error_t * lb_flowtable_flush_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api index 101cee88ded..a9f05f253c5 100644 --- a/src/plugins/lb/lb.api +++ b/src/plugins/lb/lb.api @@ -3,9 +3,9 @@ option version = "1.0.0"; /** \brief Configure Load-Balancer global parameters @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request - @param ip4_src_address - IPv4 address to be used as source for IPv4 GRE traffic. - @param ip6_src_address - IPv6 address to be used as source for IPv6 GRE traffic. - @param n_sticky_buckets - Number of buckets *per worker thread* in the + @param ip4_src_address - IPv4 address to be used as source for IPv4 traffic(applicable in GRE4/GRE6/NAT4/NAT6 mode only). 
+ @param ip6_src_address - IPv6 address to be used as source for IPv6 traffic(applicable in GRE4/GRE6/NAT4/NAT6 mode only). + @param sticky_buckets_per_core - Number of buckets *per worker thread* in the established flow table (must be power of 2). @param flow_timeout - Time in seconds after which, if no packet is received for a given flow, the flow is removed from the established flow table. @@ -25,8 +25,12 @@ autoreply define lb_conf @param context - sender context, to match reply w/ request @param ip_prefix - IP address (IPv4 in lower order 32 bits). @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4). - @param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2). + @param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2) or NAT4(3) or NAT6(4). @param dscp - DSCP bit corresponding to VIP(applicable in L3DSR mode only). + @param type - service type(applicable in NAT4/NAT6 mode only). + @param port - service port(applicable in NAT4/NAT6 mode only). + @param target_port - Pod's port corresponding to specific service(applicable in NAT4/NAT6 mode only). + @param node_port - Node's port(applicable in NAT4/NAT6 mode only). @param new_flows_table_length - Size of the new connections flow table used for this VIP (must be power of 2). @param is_del - The VIP should be removed. 
@@ -38,6 +42,10 @@ autoreply define lb_add_del_vip { u8 prefix_length; u8 encap; u8 dscp; + u8 type; + u16 port; + u16 target_port; + u16 node_port; u32 new_flows_table_length; u8 is_del; }; diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index 090d190e08b..e1d4df55a3e 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -17,6 +17,7 @@ #include #include #include +#include //GC runs at most once every so many seconds #define LB_GARBAGE_RUN 60 @@ -36,22 +37,34 @@ const static char * const lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL }; const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL }; const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, - [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, + [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, }; const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL }; const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL }; const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, - [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, + [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, }; const static char * const lb_dpo_l3dsr_ip4[] = { "lb4-l3dsr" , NULL }; const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4, + [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4, + }; + +const static char * const lb_dpo_nat4_ip4[] = { "lb4-nat4" , NULL }; +const static char* const * const lb_dpo_nat4_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_nat4_ip4, + }; + +const static char * const lb_dpo_nat6_ip6[] = { "lb6-nat6" , NULL }; +const static char* const * const lb_dpo_nat6_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP6] = lb_dpo_nat6_ip6, }; u32 lb_hash_time_now(vlib_main_t * vm) @@ -88,6 +101,8 @@ static char *lb_vip_type_strings[] = { [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6", [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4", [LB_VIP_TYPE_IP4_L3DSR] = 
"ip4-l3dsr", + [LB_VIP_TYPE_IP4_NAT4] = "ip4-nat4", + [LB_VIP_TYPE_IP6_NAT6] = "ip6-nat6", }; u8 *format_lb_vip_type (u8 * s, va_list * args) @@ -115,20 +130,39 @@ uword unformat_lb_vip_type (unformat_input_t * input, va_list * args) u8 *format_lb_vip (u8 * s, va_list * args) { lb_vip_t *vip = va_arg (*args, lb_vip_t *); - return format(s, "%U %U new_size:%u #as:%u%s", + s = format(s, "%U %U new_size:%u #as:%u%s", format_lb_vip_type, vip->type, format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY, vip->new_flow_table_mask + 1, pool_elts(vip->as_indexes), (vip->flags & LB_VIP_FLAGS_USED)?"":" removed"); + + if (vip->type == LB_VIP_TYPE_IP4_L3DSR) + { + s = format(s, " dscp:%u", vip->encap_args.dscp); + } + else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) + || (vip->type == LB_VIP_TYPE_IP6_NAT6)) + { + if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) + s = format (s, " type:clusterip port:%u target_port:%u", + ntohs (vip->encap_args.port), + ntohs (vip->encap_args.target_port)); + else + s = format (s, " type:nodeport node_port:%u target_port:%u", + ntohs (vip->encap_args.node_port), + ntohs (vip->encap_args.target_port)); + } + + return s; } u8 *format_lb_as (u8 * s, va_list * args) { lb_as_t *as = va_arg (*args, lb_as_t *); return format(s, "%U %s", format_ip46_address, - &as->address, IP46_TYPE_ANY, - (as->flags & LB_AS_FLAGS_USED)?"used":"removed"); + &as->address, IP46_TYPE_ANY, + (as->flags & LB_AS_FLAGS_USED)?"used":"removed"); } u8 *format_lb_vip_detailed (u8 * s, va_list * args) @@ -151,7 +185,20 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) { s = format(s, "%U dscp:%u\n", format_white_space, indent, - vip->dscp); + vip->encap_args.dscp); + } + else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) + || (vip->type == LB_VIP_TYPE_IP6_NAT6)) + { + if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) + s = format (s, "%U type:clusterip port:%u target_port:%u", + format_white_space, indent, ntohs (vip->encap_args.port), + ntohs 
(vip->encap_args.target_port)); + else + s = format (s, "%U type:nodeport node_port:%u target_port:%u", + format_white_space, indent, + ntohs (vip->encap_args.node_port), + ntohs (vip->encap_args.target_port)); } //Print counters @@ -219,6 +266,11 @@ static int lb_pseudorand_compare(void *a, void *b) static void lb_vip_garbage_collection(lb_vip_t *vip) { lb_main_t *lbm = &lb_main; + lb_snat4_key_t m_key4; + clib_bihash_kv_8_8_t kv4, value4; + lb_snat6_key_t m_key6; + clib_bihash_kv_24_8_t kv6, value6; + lb_snat_mapping_t *m = 0; ASSERT (lbm->writer_lock[0]); u32 now = (u32) vlib_time_now(vlib_get_main()); @@ -231,18 +283,52 @@ static void lb_vip_garbage_collection(lb_vip_t *vip) pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; if (!(as->flags & LB_AS_FLAGS_USED) && //Not used - clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used - (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) - { //Not referenced - fib_entry_child_remove(as->next_hop_fib_entry_index, - as->next_hop_child_index); - fib_table_entry_delete_index(as->next_hop_fib_entry_index, - FIB_SOURCE_RR); - as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID; - - pool_put(vip->as_indexes, as_index); - pool_put(lbm->ass, as); - } + clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used + (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) + { //Not referenced + + if (lb_vip_is_nat4(vip)) { + m_key4.addr = as->address.ip4; + m_key4.port = vip->encap_args.target_port; + m_key4.protocol = 0; + m_key4.fib_index = 0; + + kv4.key = m_key4.as_u64; + if(!clib_bihash_search_8_8(&lbm->mapping_by_as4, &kv4, &value4)) + m = pool_elt_at_index (lbm->snat_mappings, value4.value); + ASSERT (m); + + kv4.value = m - lbm->snat_mappings; + clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 0); + pool_put (lbm->snat_mappings, m); + } else if (lb_vip_is_nat6(vip)) { + m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0]; + 
m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1]; + m_key6.port = vip->encap_args.target_port; + m_key6.protocol = 0; + m_key6.fib_index = 0; + + kv6.key[0] = m_key6.as_u64[0]; + kv6.key[1] = m_key6.as_u64[1]; + kv6.key[2] = m_key6.as_u64[2]; + + if (!clib_bihash_search_24_8 (&lbm->mapping_by_as6, &kv6, &value6)) + m = pool_elt_at_index (lbm->snat_mappings, value6.value); + ASSERT (m); + + kv6.value = m - lbm->snat_mappings; + clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 0); + pool_put (lbm->snat_mappings, m); + } + fib_entry_child_remove(as->next_hop_fib_entry_index, + as->next_hop_child_index); + fib_table_entry_delete_index(as->next_hop_fib_entry_index, + FIB_SOURCE_RR); + as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID; + + pool_put(vip->as_indexes, as_index); + pool_put(lbm->ass, as); + } }); } @@ -453,6 +539,7 @@ int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n) u32 *to_be_updated = 0; u32 i; u32 *ip; + lb_snat_mapping_t *m; //Sanity check while (n--) { @@ -512,26 +599,96 @@ next: */ fib_prefix_t nh = {}; if (lb_encap_is_ip4(vip)) { - nh.fp_addr.ip4 = as->address.ip4; - nh.fp_len = 32; - nh.fp_proto = FIB_PROTOCOL_IP4; + nh.fp_addr.ip4 = as->address.ip4; + nh.fp_len = 32; + nh.fp_proto = FIB_PROTOCOL_IP4; } else { - nh.fp_addr.ip6 = as->address.ip6; - nh.fp_len = 128; - nh.fp_proto = FIB_PROTOCOL_IP6; + nh.fp_addr.ip6 = as->address.ip6; + nh.fp_len = 128; + nh.fp_proto = FIB_PROTOCOL_IP6; } as->next_hop_fib_entry_index = fib_table_entry_special_add(0, - &nh, - FIB_SOURCE_RR, - FIB_ENTRY_FLAG_NONE); + &nh, + FIB_SOURCE_RR, + FIB_ENTRY_FLAG_NONE); as->next_hop_child_index = fib_entry_child_add(as->next_hop_fib_entry_index, - lbm->fib_node_type, - as - lbm->ass); + lbm->fib_node_type, + as - lbm->ass); lb_as_stack(as); + + if ( lb_vip_is_nat4(vip) || lb_vip_is_nat6(vip) ) + { + /* Add SNAT static mapping */ + pool_get (lbm->snat_mappings, m); + memset (m, 0, sizeof (*m)); + if (lb_vip_is_nat4(vip)) { + lb_snat4_key_t m_key4; + 
clib_bihash_kv_8_8_t kv4; + m_key4.addr = as->address.ip4; + m_key4.port = vip->encap_args.target_port; + m_key4.protocol = 0; + m_key4.fib_index = 0; + + if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) + { + m->src_ip.ip4 = vip->prefix.ip4; + m->src_port = vip->encap_args.port; + } + else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT) + { + m->src_ip.ip4 = lbm->ip4_src_address; + m->src_port = vip->encap_args.node_port; + } + m->src_ip_is_ipv6 = 0; + m->as_ip.ip4 = as->address.ip4; + m->as_ip_is_ipv6 = 0;; + m->target_port = vip->encap_args.target_port; + m->vrf_id = 0; + m->fib_index = 0; + + kv4.key = m_key4.as_u64; + kv4.value = m - lbm->snat_mappings; + clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 1); + } else { + lb_snat6_key_t m_key6; + clib_bihash_kv_24_8_t kv6; + m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0]; + m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1]; + m_key6.port = vip->encap_args.target_port; + m_key6.protocol = 0; + m_key6.fib_index = 0; + + if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) + { + m->src_ip.ip6.as_u64[0] = vip->prefix.ip6.as_u64[0]; + m->src_ip.ip6.as_u64[1] = vip->prefix.ip6.as_u64[1]; + m->src_port = vip->encap_args.port; + } + else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT) + { + m->src_ip.ip6.as_u64[0] = lbm->ip6_src_address.as_u64[0]; + m->src_ip.ip6.as_u64[1] = lbm->ip6_src_address.as_u64[1]; + m->src_port = vip->encap_args.node_port; + } + m->src_ip_is_ipv6 = 1; + m->as_ip.ip6.as_u64[0] = as->address.ip6.as_u64[0]; + m->as_ip.ip6.as_u64[1] = as->address.ip6.as_u64[1]; + m->as_ip_is_ipv6 = 1; + m->target_port = vip->encap_args.target_port; + m->vrf_id = 0; + m->fib_index = 0; + + kv6.key[0] = m_key6.as_u64[0]; + kv6.key[1] = m_key6.as_u64[1]; + kv6.key[2] = m_key6.as_u64[2]; + kv6.value = m - lbm->snat_mappings; + clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 1); + } + } } vec_free(to_be_added); @@ -631,13 +788,17 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t 
*vip) dpo_type = lbm->dpo_gre6_type; else if (lb_vip_is_l3dsr(vip)) dpo_type = lbm->dpo_l3dsr_type; + else if(lb_vip_is_nat4(vip)) + dpo_type = lbm->dpo_nat4_type; + else if (lb_vip_is_nat6(vip)) + dpo_type = lbm->dpo_nat6_type; dpo_set(&dpo, dpo_type, proto, vip - lbm->vips); fib_table_entry_special_dpo_add(0, - &pfx, - FIB_SOURCE_PLUGIN_HI, - FIB_ENTRY_FLAG_EXCLUSIVE, - &dpo); + &pfx, + FIB_SOURCE_PLUGIN_HI, + FIB_ENTRY_FLAG_EXCLUSIVE, + &dpo); dpo_reset(&dpo); } @@ -659,37 +820,41 @@ static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip) fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI); } -int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, - u32 new_length, u32 *vip_index) +int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) { lb_main_t *lbm = &lb_main; + vlib_main_t *vm = vlib_get_main(); lb_vip_t *vip; + lb_vip_type_t type = args.type; + u16 node_port = args.encap_args.node_port; lb_get_writer_lock(); - ip46_prefix_normalize(prefix, plen); + ip46_prefix_normalize(&(args.prefix), args.plen); - if (!lb_vip_find_index_with_lock(prefix, plen, vip_index)) { + if (!lb_vip_find_index_with_lock(&(args.prefix), args.plen, vip_index)) { lb_put_writer_lock(); return VNET_API_ERROR_VALUE_EXIST; } - if (!is_pow2(new_length)) { + if (!is_pow2(args.new_length)) { lb_put_writer_lock(); return VNET_API_ERROR_INVALID_MEMORY_SIZE; } - if (ip46_prefix_is_ip4(prefix, plen) && + if (ip46_prefix_is_ip4(&(args.prefix), args.plen) && (type != LB_VIP_TYPE_IP4_GRE4) && (type != LB_VIP_TYPE_IP4_GRE6) && - (type != LB_VIP_TYPE_IP4_L3DSR)) + (type != LB_VIP_TYPE_IP4_L3DSR) && + (type != LB_VIP_TYPE_IP4_NAT4)) return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; - if ((!ip46_prefix_is_ip4(prefix, plen)) && + if ((!ip46_prefix_is_ip4(&(args.prefix), args.plen)) && (type != LB_VIP_TYPE_IP6_GRE4) && - (type != LB_VIP_TYPE_IP6_GRE6)) + (type != LB_VIP_TYPE_IP6_GRE6) && + (type != LB_VIP_TYPE_IP6_NAT6)) return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; - 
if ((type == LB_VIP_TYPE_IP4_L3DSR) && (dscp >= 64 ) ) + if ((type == LB_VIP_TYPE_IP4_L3DSR) && (args.encap_args.dscp >= 64 ) ) { return VNET_API_ERROR_VALUE_EXIST; } @@ -698,11 +863,23 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, pool_get(lbm->vips, vip); //Init - vip->prefix = *prefix; - vip->plen = plen; + memcpy (&(vip->prefix), &(args.prefix), sizeof(args.prefix)); + vip->plen = args.plen; vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main()); - vip->type = type; - vip->dscp = dscp; + vip->type = args.type; + + if (args.type == LB_VIP_TYPE_IP4_L3DSR) { + vip->encap_args.dscp = args.encap_args.dscp; + } + else if ((args.type == LB_VIP_TYPE_IP4_NAT4) + ||(args.type == LB_VIP_TYPE_IP6_NAT6)) { + vip->encap_args.srv_type = args.encap_args.srv_type; + vip->encap_args.port = clib_host_to_net_u16(args.encap_args.port); + vip->encap_args.target_port = + clib_host_to_net_u16(args.encap_args.target_port); + vip->encap_args.node_port = clib_host_to_net_u16(node_port); + } + vip->flags = LB_VIP_FLAGS_USED; vip->as_indexes = 0; @@ -714,7 +891,7 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, } //Configure new flow table - vip->new_flow_table_mask = new_length - 1; + vip->new_flow_table_mask = args.new_length - 1; vip->new_flow_table = 0; //Create a new flow hash table full of the default entry @@ -723,6 +900,27 @@ int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, //Create adjacency to direct traffic lb_vip_add_adjacency(lbm, vip); + if ( (lb_vip_is_nat4(vip) || lb_vip_is_nat6(vip)) + && (args.encap_args.srv_type == LB_SRV_TYPE_NODEPORT) ) + { + u32 key; + uword * entry; + + //Create maping from nodeport to vip_index + key = clib_host_to_net_u16(node_port); + entry = hash_get_mem (lbm->vip_index_by_nodeport, &key); + if (entry) { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + hash_set_mem (lbm->vip_index_by_nodeport, &key, vip - lbm->vips); + + 
/* receive packets destined to NodeIP:NodePort */ + udp_register_dst_port (vm, node_port, lb4_nodeport_node.index, 1); + udp_register_dst_port (vm, node_port, lb6_nodeport_node.index, 0); + } + //Return result *vip_index = vip - lbm->vips; @@ -819,12 +1017,16 @@ lb_as_stack (lb_as_t *as) dpo_type = lbm->dpo_gre6_type; else if (lb_vip_is_l3dsr(vip)) dpo_type = lbm->dpo_l3dsr_type; + else if(lb_vip_is_nat4(vip)) + dpo_type = lbm->dpo_nat4_type; + else if (lb_vip_is_nat6(vip)) + dpo_type = lbm->dpo_nat6_type; dpo_stack(dpo_type, - lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, - &as->dpo, - fib_entry_contribute_ip_forwarding( - as->next_hop_fib_entry_index)); + lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, + &as->dpo, + fib_entry_contribute_ip_forwarding( + as->next_hop_fib_entry_index)); } static fib_node_back_walk_rc_t @@ -835,11 +1037,46 @@ lb_fib_node_back_walk_notify (fib_node_t *node, return (FIB_NODE_BACK_WALK_CONTINUE); } +int lb_nat4_interface_add_del (u32 sw_if_index, int is_del) +{ + if (is_del) + { + vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out", + sw_if_index, 0, 0, 0); + } + else + { + vnet_feature_enable_disable ("ip4-unicast", "lb-nat4-in2out", + sw_if_index, 1, 0, 0); + } + + return 0; +} + +int lb_nat6_interface_add_del (u32 sw_if_index, int is_del) +{ + if (is_del) + { + vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out", + sw_if_index, 0, 0, 0); + } + else + { + vnet_feature_enable_disable ("ip6-unicast", "lb-nat6-in2out", + sw_if_index, 1, 0, 0); + } + + return 0; +} + clib_error_t * lb_init (vlib_main_t * vm) { vlib_thread_main_t *tm = vlib_get_thread_main (); lb_main_t *lbm = &lb_main; + lbm->vnet_main = vnet_get_main (); + lbm->vlib_main = vm; + lb_as_t *default_as; fib_node_vft_t lb_fib_node_vft = { .fnv_get = lb_fib_node_get_node, @@ -865,6 +1102,8 @@ lb_init (vlib_main_t * vm) lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes); lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, 
lb_dpo_gre6_nodes); lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, lb_dpo_l3dsr_nodes); + lbm->dpo_nat4_type = dpo_register_new_type(&lb_vft, lb_dpo_nat4_nodes); + lbm->dpo_nat6_type = dpo_register_new_type(&lb_vft, lb_dpo_nat6_nodes); lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); //Init AS reference counters @@ -879,6 +1118,17 @@ lb_init (vlib_main_t * vm) default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL; default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL; + lbm->vip_index_by_nodeport + = hash_create_mem (0, sizeof(u16), sizeof (uword)); + + clib_bihash_init_8_8 (&lbm->mapping_by_as4, + "mapping_by_as4", LB_MAPPING_BUCKETS, + LB_MAPPING_MEMORY_SIZE); + + clib_bihash_init_24_8 (&lbm->mapping_by_as6, + "mapping_by_as6", LB_MAPPING_BUCKETS, + LB_MAPPING_MEMORY_SIZE); + #define _(a,b,c) lbm->vip_counters[c].name = b; lb_foreach_vip_counter #undef _ diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h index 61d17d713a5..1526298b0fa 100644 --- a/src/plugins/lb/lb.h +++ b/src/plugins/lb/lb.h @@ -38,17 +38,65 @@ #include #include #include - +#include +#include #include #define LB_DEFAULT_PER_CPU_STICKY_BUCKETS 1 << 10 #define LB_DEFAULT_FLOW_TIMEOUT 40 +#define LB_MAPPING_BUCKETS 1024 +#define LB_MAPPING_MEMORY_SIZE 64<<20 typedef enum { LB_NEXT_DROP, LB_N_NEXT, } lb_next_t; +typedef enum { + LB_NAT4_IN2OUT_NEXT_DROP, + LB_NAT4_IN2OUT_NEXT_LOOKUP, + LB_NAT4_IN2OUT_N_NEXT, +} LB_nat4_in2out_next_t; + +typedef enum { + LB_NAT6_IN2OUT_NEXT_DROP, + LB_NAT6_IN2OUT_NEXT_LOOKUP, + LB_NAT6_IN2OUT_N_NEXT, +} LB_nat6_in2out_next_t; + +#define foreach_lb_nat_in2out_error \ +_(UNSUPPORTED_PROTOCOL, "Unsupported protocol") \ +_(IN2OUT_PACKETS, "Good in2out packets processed") \ +_(NO_TRANSLATION, "No translation") + +typedef enum { +#define _(sym,str) LB_NAT_IN2OUT_ERROR_##sym, + foreach_lb_nat_in2out_error +#undef _ + LB_NAT_IN2OUT_N_ERROR, +} lb_nat_in2out_error_t; + +/** + * lb for kube-proxy supports three types of service + */ +typedef 
enum { + LB_SRV_TYPE_CLUSTERIP, + LB_SRV_TYPE_NODEPORT, + LB_SRV_N_TYPES, +} lb_svr_type_t; + +typedef enum { + LB4_NODEPORT_NEXT_IP4_NAT4, + LB4_NODEPORT_NEXT_DROP, + LB4_NODEPORT_N_NEXT, +} lb4_nodeport_next_t; + +typedef enum { + LB6_NODEPORT_NEXT_IP6_NAT6, + LB6_NODEPORT_NEXT_DROP, + LB6_NODEPORT_N_NEXT, +} lb6_nodeport_next_t; + /** * Each VIP is configured with a set of * application server. @@ -133,12 +181,14 @@ typedef enum { LB_ENCAP_TYPE_GRE4, LB_ENCAP_TYPE_GRE6, LB_ENCAP_TYPE_L3DSR, + LB_ENCAP_TYPE_NAT4, + LB_ENCAP_TYPE_NAT6, LB_ENCAP_N_TYPES, } lb_encap_type_t; /** * The load balancer supports IPv4 and IPv6 traffic - * and GRE4, GRE6 and L3DSR encap. + * and GRE4, GRE6, L3DSR and NAT4, NAT6 encap. */ typedef enum { LB_VIP_TYPE_IP6_GRE6, @@ -146,13 +196,39 @@ typedef enum { LB_VIP_TYPE_IP4_GRE6, LB_VIP_TYPE_IP4_GRE4, LB_VIP_TYPE_IP4_L3DSR, + LB_VIP_TYPE_IP4_NAT4, + LB_VIP_TYPE_IP6_NAT6, LB_VIP_N_TYPES, } lb_vip_type_t; - format_function_t format_lb_vip_type; unformat_function_t unformat_lb_vip_type; + +/* args for different vip encap types */ +typedef struct { + union + { + struct + { + /* Service type. clusterip or nodeport */ + u8 srv_type; + + /* Service port. network byte order */ + u16 port; + + /* Pod's port corresponding to specific service. network byte order */ + u16 target_port; + + /* Node's port, can access service via NodeIP:node_port. network byte order */ + u16 node_port; + }; + /* DSCP bits for L3DSR */ + u8 dscp; + u64 as_u64; + }; +} lb_vip_encap_args_t; + /** * Load balancing service is provided per VIP. * In this data model, a VIP can be a whole prefix. @@ -205,10 +281,8 @@ typedef struct { */ lb_vip_type_t type; - /** - * DSCP bits for L3DSR - */ - u8 dscp; + /* args for different vip encap types */ + lb_vip_encap_args_t encap_args; /** * Flags related to this VIP. 
@@ -229,21 +303,100 @@ typedef struct { #define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \ - || (vip)->type == LB_VIP_TYPE_IP4_L3DSR ) + || (vip)->type == LB_VIP_TYPE_IP4_L3DSR \ + || (vip)->type == LB_VIP_TYPE_IP4_NAT4 ) #define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE4) + #define lb_vip_is_gre6(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE6 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE6) -#define lb_vip_is_l3dsr(vip) (vip)->type == LB_VIP_TYPE_IP4_L3DSR + +#define lb_vip_is_l3dsr(vip) ((vip)->type == LB_VIP_TYPE_IP4_L3DSR) + +#define lb_vip_is_nat4(vip) ((vip)->type == LB_VIP_TYPE_IP4_NAT4) + +#define lb_vip_is_nat6(vip) ((vip)->type == LB_VIP_TYPE_IP6_NAT6) #define lb_encap_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \ - || (vip)->type == LB_VIP_TYPE_IP4_L3DSR) + || (vip)->type == LB_VIP_TYPE_IP4_L3DSR \ + || (vip)->type == LB_VIP_TYPE_IP4_NAT4 ) format_function_t format_lb_vip; format_function_t format_lb_vip_detailed; +#define foreach_lb_nat_protocol \ + _(UDP, 0, udp, "udp") \ + _(TCP, 1, tcp, "tcp") + +typedef enum { +#define _(N, i, n, s) LB_NAT_PROTOCOL_##N = i, + foreach_lb_nat_protocol +#undef _ +} lb_nat_protocol_t; + +always_inline u32 +lb_ip_proto_to_nat_proto (u8 ip_proto) +{ + u32 nat_proto = ~0; + + nat_proto = (ip_proto == IP_PROTOCOL_UDP) ? LB_NAT_PROTOCOL_UDP : nat_proto; + nat_proto = (ip_proto == IP_PROTOCOL_TCP) ? 
LB_NAT_PROTOCOL_TCP : nat_proto; + + return nat_proto; +} + +/* Key for Pod's egress SNAT */ +typedef struct { + union + { + struct + { + ip4_address_t addr; + u16 port; + u16 protocol:3, + fib_index:13; + }; + u64 as_u64; + }; +} lb_snat4_key_t; + +typedef struct +{ + union + { + struct + { + ip6_address_t addr; + u16 port; + u16 protocol; + u32 fib_index; + }; + u64 as_u64[3]; + }; +} lb_snat6_key_t; + +typedef struct { + /** + * for vip + port case, src_ip = vip; + * for node ip + node_port, src_ip = node_ip + */ + ip46_address_t src_ip; + ip46_address_t as_ip; + u8 src_ip_is_ipv6; + u8 as_ip_is_ipv6; + /** + * Network byte order + * for vip + port case, src_port = port; + * for node ip + node_port, src_port = node_port + */ + u16 src_port; + u16 target_port; /* Network byte order */ + u32 vrf_id; + u32 fib_index; +} lb_snat_mapping_t; + typedef struct { /** * Each CPU has its own sticky flow hash table. @@ -273,6 +426,9 @@ typedef struct { */ vlib_refcount_t as_refcount; + /* hash lookup vip_index by key: {u16: nodeport} */ + uword * vip_index_by_nodeport; + /** * Some global data is per-cpu */ @@ -314,23 +470,49 @@ typedef struct { dpo_type_t dpo_gre4_type; dpo_type_t dpo_gre6_type; dpo_type_t dpo_l3dsr_type; + dpo_type_t dpo_nat4_type; + dpo_type_t dpo_nat6_type; /** * Node type for registering to fib changes. */ fib_node_type_t fib_node_type; + /* Find a static mapping by AS IP : target_port */ + clib_bihash_8_8_t mapping_by_as4; + clib_bihash_24_8_t mapping_by_as6; + + /* Static mapping pool */ + lb_snat_mapping_t * snat_mappings; + /** * API dynamically registered base ID. 
*/ u16 msg_id_base; volatile u32 *writer_lock; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; } lb_main_t; +/* args for different vip encap types */ +typedef struct { + ip46_address_t prefix; + u8 plen; + lb_vip_type_t type; + u32 new_length; + lb_vip_encap_args_t encap_args; +} lb_vip_add_args_t; + extern lb_main_t lb_main; -extern vlib_node_registration_t lb6_node; extern vlib_node_registration_t lb4_node; +extern vlib_node_registration_t lb6_node; +extern vlib_node_registration_t lb4_nodeport_node; +extern vlib_node_registration_t lb6_nodeport_node; +extern vlib_node_registration_t lb_nat4_in2out_node; +extern vlib_node_registration_t lb_nat6_in2out_node; /** * Fix global load-balancer parameters. @@ -341,8 +523,8 @@ extern vlib_node_registration_t lb4_node; int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, u32 sticky_buckets, u32 flow_timeout); -int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u8 dscp, - u32 new_length, u32 *vip_index); +int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index); + int lb_vip_del(u32 vip_index); int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index); @@ -356,6 +538,9 @@ u32 lb_hash_time_now(vlib_main_t * vm); void lb_garbage_collection(); +int lb_nat4_interface_add_del (u32 sw_if_index, int is_del); +int lb_nat6_interface_add_del (u32 sw_if_index, int is_del); + format_function_t format_lb_main; #endif /* LB_PLUGIN_LB_LB_H_ */ diff --git a/src/plugins/lb/lb_plugin_doc.md b/src/plugins/lb/lb_plugin_doc.md index 7672b1e88d7..25a4cfa11df 100644 --- a/src/plugins/lb/lb_plugin_doc.md +++ b/src/plugins/lb/lb_plugin_doc.md @@ -29,6 +29,32 @@ Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must be using the same encap. type (i.e. IPv4+GRE or IPv6+GRE or IPv4+L3DSR). Meaning that for a given VIP, all AS addresses must be of the same family. +3). 
IPv4/IPv6 + NAT4/NAT6 encap types: +This type provides a kube-proxy data plane in user space, +which is used to replace the Linux kernel's kube-proxy based on iptables. + +Currently, the load balancer plugin supports three service types: +a) Cluster IP plus Port: supports any protocol, including TCP and UDP. +b) Node IP plus Node Port: currently only supports UDP. +c) External Load Balancer. + +For the Cluster IP plus Port case: +kube-proxy is configured with a set of Virtual IPs (VIP, which can be +prefixes), and for each VIP, with a set of AS addresses (ASs). + +For a specific session received for a given VIP (or VIP prefix), +the first packet selects an AS according to the internal load balancing algorithm, +then undergoes DNAT and is sent to the chosen AS. +At the same time, a session entry is created to store the chosen AS. +Following packets for that session look up the session table first, +which ensures that a given session will always be routed to the same AS. + +Packets returning from the AS undergo SNAT and are then sent out. + +Please refer to the following for details: +https://schd.ws/hosted_files/ossna2017/1e/VPP_K8S_GTPU_OSSNA.pdf + + ## Performances The load balancer has been tested up to 1 millions flows and still forwards more @@ -45,9 +71,11 @@ The load balancer needs to be configured with some parameters: lb conf [ip4-src-address <addr>] [ip6-src-address <addr>] [buckets <n>] [timeout <s>] -ip4-src-address: the source address used to send encap. packets using IPv4. +ip4-src-address: the source address used to send encap. packets using IPv4 for GRE4 mode. + or Node IP4 address for NAT4 mode. -ip6-src-address: the source address used to send encap. packets using IPv6. +ip6-src-address: the source address used to send encap. packets using IPv6 for GRE6 mode. + or Node IP6 address for NAT6 mode. buckets: the *per-thread* established-connexions-table number of buckets.
@@ -57,13 +85,15 @@ timeout: the number of seconds a connection will remain in the ### Configure the VIPs - lb vip <prefix> [encap (gre6|gre4|l3dsr)] [dscp <n>] [new_len <n>] [del] + lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] \ + [dscp <n>] [port <n> target_port <n> node_port <n>] [new_len <n>] [del] new_len is the size of the new-connection-table. It should be 1 or 2 orders of magnitude bigger than the number of ASs for the VIP in order to ensure a good load balancing. Encap l3dsr and dscp is used to map VIP to dscp bit and rewrite DSCP bit in packets. So the selected server could get VIP from DSCP bit in this packet and perform DSR. +Encap nat4/nat6 and port/target_port/node_port are used to implement the kube-proxy data plane. Examples: @@ -72,6 +102,8 @@ Examples: lb vip 80.0.0.0/8 encap gre6 new_len 16 lb vip 90.0.0.0/8 encap gre4 new_len 1024 lb vip 100.0.0.0/8 encap l3dsr dscp 2 new_len 32 + lb vip 90.1.2.1/32 encap nat4 port 3306 target_port 3307 node_port 30964 new_len 1024 + lb vip 2004::/16 encap nat6 port 6306 target_port 6307 node_port 30966 new_len 1024 ### Configure the ASs (for each VIP) @@ -86,8 +118,18 @@ Examples: lb as 2003::/16 10.0.0.1 10.0.0.2 lb as 80.0.0.0/8 2001::2 lb as 90.0.0.0/8 10.0.0.1 - - + +### Configure SNAT + + lb set interface nat4 in <intfc> [del] + +Enable the SNAT feature on a specific interface. +(applicable in NAT4 mode only) + + lb set interface nat6 in <intfc> [del] + +Enable the SNAT feature on a specific interface. +(applicable in NAT6 mode only) ## Monitoring @@ -97,7 +139,7 @@ These are still subject to quite significant changes. show lb show lb vip show lb vip verbose - + show node counters @@ -105,9 +147,9 @@ These are still subject to quite significant changes. ### Multi-Threading -MagLev is a distributed system which pseudo-randomly generates a -new-connections-table based on AS names such that each server configured with -the same set of ASs ends up with the same table.
Connection stickyness is then +MagLev is a distributed system which pseudo-randomly generates a +new-connections-table based on AS names such that each server configured with +the same set of ASs ends up with the same table. Connection stickyness is then ensured with an established-connections-table. Using ECMP, it is assumed (but not relied on) that servers will mostly receive traffic for different flows. @@ -133,8 +175,8 @@ When an AS is removed, there is two possible ways to react. - Keep using the AS for established connections - Change AS for established connections (likely to cause error for TCP) -In the first case, although an AS is removed from the configuration, its -associated state needs to stay around as long as it is used by at least one +In the first case, although an AS is removed from the configuration, its +associated state needs to stay around as long as it is used by at least one thread. In order to avoid locks, a specific reference counter is used. The design is quite diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c index b02793944c5..fc498706222 100644 --- a/src/plugins/lb/lb_test.c +++ b/src/plugins/lb/lb_test.c @@ -171,6 +171,10 @@ static int api_lb_add_del_vip (vat_main_t * vam) mps.encap = LB_ENCAP_TYPE_GRE6; } else if (unformat(i, "l3dsr")) { mps.encap = LB_ENCAP_TYPE_L3DSR; + } else if (unformat(i, "nat4")) { + mps.encap = LB_ENCAP_TYPE_NAT4; + } else if (unformat(i, "nat6")) { + mps.encap = LB_ENCAP_TYPE_NAT6; } else { errmsg ("no encap\n"); return -99; @@ -221,7 +225,9 @@ static int api_lb_add_del_as (vat_main_t * vam) */ #define foreach_vpe_api_msg \ _(lb_conf, " ") \ -_(lb_add_del_vip, " [gre4|gre6] [del]") \ +_(lb_add_del_vip, " [gre4|gre6|l3dsr|nat4|nat6] " \ + " " \ + " [del]") \ _(lb_add_del_as, "
[del]") static void diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c index 529da73bcff..e19964d2f1c 100644 --- a/src/plugins/lb/node.c +++ b/src/plugins/lb/node.c @@ -14,6 +14,7 @@ */ #include +#include #include #include @@ -22,69 +23,136 @@ _(NONE, "no error") \ _(PROTO_NOT_SUPPORTED, "protocol not supported") -typedef enum { +typedef enum +{ #define _(sym,str) LB_ERROR_##sym, foreach_lb_error #undef _ - LB_N_ERROR, + LB_N_ERROR, } lb_error_t; -static char *lb_error_strings[] = { +static char *lb_error_strings[] = + { #define _(sym,string) string, - foreach_lb_error + foreach_lb_error #undef _ -}; + }; -typedef struct { +typedef struct +{ u32 vip_index; u32 as_index; } lb_trace_t; +typedef struct +{ + u32 vip_index; + + u32 node_port; +} lb_nodeport_trace_t; + +typedef struct +{ + u32 vip_index; + u32 as_index; + u32 rx_sw_if_index; + u32 next_index; +} lb_nat_trace_t; + u8 * format_lb_trace (u8 * s, va_list * args) { lb_main_t *lbm = &lb_main; - CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); - CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + CLIB_UNUSED(vlib_main_t * vm) += va_arg (*args, vlib_main_t *); + CLIB_UNUSED(vlib_node_t * node) + = va_arg (*args, vlib_node_t *); lb_trace_t *t = va_arg (*args, lb_trace_t *); - if (pool_is_free_index(lbm->vips, t->vip_index)) { - s = format(s, "lb vip[%d]: This VIP was freed since capture\n"); - } else { - s = format(s, "lb vip[%d]: %U\n", t->vip_index, format_lb_vip, &lbm->vips[t->vip_index]); - } - if (pool_is_free_index(lbm->ass, t->as_index)) { - s = format(s, "lb as[%d]: This AS was freed since capture\n"); - } else { - s = format(s, "lb as[%d]: %U\n", t->as_index, format_lb_as, &lbm->ass[t->as_index]); - } + if (pool_is_free_index(lbm->vips, t->vip_index)) + { + s = format (s, "lb vip[%d]: This VIP was freed since capture\n"); + } + else + { + s = format (s, "lb vip[%d]: %U\n", t->vip_index, format_lb_vip, + &lbm->vips[t->vip_index]); + } + if 
(pool_is_free_index(lbm->ass, t->as_index)) + { + s = format (s, "lb as[%d]: This AS was freed since capture\n"); + } + else + { + s = format (s, "lb as[%d]: %U\n", t->as_index, format_lb_as, + &lbm->ass[t->as_index]); + } return s; } -lb_hash_t *lb_get_sticky_table(u32 thread_index) +u8 * +format_lb_nat_trace (u8 * s, va_list * args) +{ + lb_main_t *lbm = &lb_main; + CLIB_UNUSED(vlib_main_t * vm) += va_arg (*args, vlib_main_t *); + CLIB_UNUSED(vlib_node_t * node) + = va_arg (*args, vlib_node_t *); + lb_nat_trace_t *t = va_arg (*args, lb_nat_trace_t *); + + if (pool_is_free_index(lbm->vips, t->vip_index)) + { + s = format (s, "lb vip[%d]: This VIP was freed since capture\n"); + } + else + { + s = format (s, "lb vip[%d]: %U\n", t->vip_index, format_lb_vip, + &lbm->vips[t->vip_index]); + } + if (pool_is_free_index(lbm->ass, t->as_index)) + { + s = format (s, "lb as[%d]: This AS was freed since capture\n"); + } + else + { + s = format (s, "lb as[%d]: %U\n", t->as_index, format_lb_as, + &lbm->ass[t->as_index]); + } + s = format (s, "lb nat: rx_sw_if_index = %d, next_index = %d", + t->rx_sw_if_index, t->next_index); + + return s; +} + +lb_hash_t * +lb_get_sticky_table (u32 thread_index) { lb_main_t *lbm = &lb_main; lb_hash_t *sticky_ht = lbm->per_cpu[thread_index].sticky_ht; //Check if size changed - if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) + if (PREDICT_FALSE( + sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) { //Dereference everything in there lb_hash_bucket_t *b; u32 i; - lb_hash_foreach_entry(sticky_ht, b, i) { - vlib_refcount_add(&lbm->as_refcount, thread_index, b->value[i], -1); - vlib_refcount_add(&lbm->as_refcount, thread_index, 0, 1); - } + lb_hash_foreach_entry(sticky_ht, b, i) + { + vlib_refcount_add (&lbm->as_refcount, thread_index, b->value[i], -1); + vlib_refcount_add (&lbm->as_refcount, thread_index, 0, 1); + } - lb_hash_free(sticky_ht); + lb_hash_free (sticky_ht); sticky_ht 
= NULL; } //Create if necessary - if (PREDICT_FALSE(sticky_ht == NULL)) { - lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); - sticky_ht = lbm->per_cpu[thread_index].sticky_ht; - clib_warning("Regenerated sticky table %p", sticky_ht); - } + if (PREDICT_FALSE(sticky_ht == NULL)) + { + lbm->per_cpu[thread_index].sticky_ht = lb_hash_alloc ( + lbm->per_cpu_sticky_buckets, lbm->flow_timeout); + sticky_ht = lbm->per_cpu[thread_index].sticky_ht; + clib_warning("Regenerated sticky table %p", sticky_ht); + } ASSERT(sticky_ht); @@ -94,19 +162,19 @@ lb_hash_t *lb_get_sticky_table(u32 thread_index) } u64 -lb_node_get_other_ports4(ip4_header_t *ip40) +lb_node_get_other_ports4 (ip4_header_t *ip40) { return 0; } u64 -lb_node_get_other_ports6(ip6_header_t *ip60) +lb_node_get_other_ports6 (ip6_header_t *ip60) { return 0; } static_always_inline u32 -lb_node_get_hash(vlib_buffer_t *p, u8 is_input_v4) +lb_node_get_hash (vlib_buffer_t *p, u8 is_input_v4) { u32 hash; if (is_input_v4) @@ -114,347 +182,946 @@ lb_node_get_hash(vlib_buffer_t *p, u8 is_input_v4) ip4_header_t *ip40; u64 ports; ip40 = vlib_buffer_get_current (p); - if (PREDICT_TRUE (ip40->protocol == IP_PROTOCOL_TCP || - ip40->protocol == IP_PROTOCOL_UDP)) - ports = ((u64)((udp_header_t *)(ip40 + 1))->src_port << 16) | - ((u64)((udp_header_t *)(ip40 + 1))->dst_port); + if (PREDICT_TRUE( + ip40->protocol == IP_PROTOCOL_TCP + || ip40->protocol == IP_PROTOCOL_UDP)) + ports = ((u64) ((udp_header_t *) (ip40 + 1))->src_port << 16) + | ((u64) ((udp_header_t *) (ip40 + 1))->dst_port); else - ports = lb_node_get_other_ports4(ip40); + ports = lb_node_get_other_ports4 (ip40); - hash = lb_hash_hash(*((u64 *)&ip40->address_pair), ports, - 0, 0, 0); + hash = lb_hash_hash (*((u64 *) &ip40->address_pair), ports, 0, 0, 0); } else { ip6_header_t *ip60; ip60 = vlib_buffer_get_current (p); u64 ports; - if (PREDICT_TRUE (ip60->protocol == IP_PROTOCOL_TCP || - ip60->protocol == IP_PROTOCOL_UDP)) 
- ports = ((u64)((udp_header_t *)(ip60 + 1))->src_port << 16) | - ((u64)((udp_header_t *)(ip60 + 1))->dst_port); + if (PREDICT_TRUE( + ip60->protocol == IP_PROTOCOL_TCP + || ip60->protocol == IP_PROTOCOL_UDP)) + ports = ((u64) ((udp_header_t *) (ip60 + 1))->src_port << 16) + | ((u64) ((udp_header_t *) (ip60 + 1))->dst_port); else - ports = lb_node_get_other_ports6(ip60); + ports = lb_node_get_other_ports6 (ip60); - hash = lb_hash_hash(ip60->src_address.as_u64[0], - ip60->src_address.as_u64[1], - ip60->dst_address.as_u64[0], - ip60->dst_address.as_u64[1], - ports); + hash = lb_hash_hash (ip60->src_address.as_u64[0], + ip60->src_address.as_u64[1], + ip60->dst_address.as_u64[0], + ip60->dst_address.as_u64[1], ports); } return hash; } static_always_inline uword -lb_node_fn (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame, - u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6) - lb_encap_type_t encap_type) //Compile-time parameter stating that is GRE4 or GRE6 or L3DSR +lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, + u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6) + lb_encap_type_t encap_type) //Compile-time parameter is GRE4/GRE6/L3DSR/NAT4/NAT6 { lb_main_t *lbm = &lb_main; u32 n_left_from, *from, next_index, *to_next, n_left_to_next; - u32 thread_index = vlib_get_thread_index(); - u32 lb_time = lb_hash_time_now(vm); + u32 thread_index = vlib_get_thread_index (); + u32 lb_time = lb_hash_time_now (vm); - lb_hash_t *sticky_ht = lb_get_sticky_table(thread_index); + lb_hash_t *sticky_ht = lb_get_sticky_table (thread_index); from = vlib_frame_vector_args (frame); n_left_from = frame->n_vectors; next_index = node->cached_next_index; u32 nexthash0 = 0; if (PREDICT_TRUE(n_left_from > 0)) - nexthash0 = lb_node_get_hash(vlib_get_buffer (vm, from[0]), is_input_v4); + nexthash0 = lb_node_get_hash (vlib_get_buffer (vm, from[0]), is_input_v4); while (n_left_from > 0) - { - 
vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); - while (n_left_from > 0 && n_left_to_next > 0) { - u32 pi0; - vlib_buffer_t *p0; - lb_vip_t *vip0; - u32 asindex0; - u16 len0; - u32 available_index0; - u8 counter = 0; - u32 hash0 = nexthash0; - - if (PREDICT_TRUE(n_left_from > 1)) - { - vlib_buffer_t *p1 = vlib_get_buffer (vm, from[1]); - //Compute next hash and prefetch bucket - nexthash0 = lb_node_get_hash(p1, is_input_v4); - lb_hash_prefetch_bucket(sticky_ht, nexthash0); - //Prefetch for encap, next - CLIB_PREFETCH (vlib_buffer_get_current(p1) - 64, 64, STORE); - } - - if (PREDICT_TRUE(n_left_from > 2)) - { - vlib_buffer_t *p2; - p2 = vlib_get_buffer(vm, from[2]); - /* prefetch packet header and data */ - vlib_prefetch_buffer_header(p2, STORE); - CLIB_PREFETCH (vlib_buffer_get_current(p2), 64, STORE); - } - - pi0 = to_next[0] = from[0]; - from += 1; - n_left_from -= 1; - to_next += 1; - n_left_to_next -= 1; - - p0 = vlib_get_buffer (vm, pi0); - vip0 = pool_elt_at_index (lbm->vips, - vnet_buffer (p0)->ip.adj_index[VLIB_TX]); - - if (is_input_v4) - { - ip4_header_t *ip40; - ip40 = vlib_buffer_get_current (p0); - len0 = clib_net_to_host_u16(ip40->length); - } - else - { - ip6_header_t *ip60; - ip60 = vlib_buffer_get_current (p0); - len0 = clib_net_to_host_u16(ip60->payload_length) + sizeof(ip6_header_t); - } - - lb_hash_get(sticky_ht, hash0, vnet_buffer (p0)->ip.adj_index[VLIB_TX], - lb_time, &available_index0, &asindex0); - - if (PREDICT_TRUE(asindex0 != ~0)) - { - //Found an existing entry - counter = LB_VIP_COUNTER_NEXT_PACKET; - } - else if (PREDICT_TRUE(available_index0 != ~0)) - { - //There is an available slot for a new flow - asindex0 = vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; - counter = LB_VIP_COUNTER_FIRST_PACKET; - counter = (asindex0 == 0)?LB_VIP_COUNTER_NO_SERVER:counter; - - //TODO: There are race conditions with as0 and vip0 manipulation. - //Configuration may be changed, vectors resized, etc... 
- - //Dereference previously used - vlib_refcount_add(&lbm->as_refcount, thread_index, - lb_hash_available_value(sticky_ht, hash0, available_index0), -1); - vlib_refcount_add(&lbm->as_refcount, thread_index, - asindex0, 1); - - //Add sticky entry - //Note that when there is no AS configured, an entry is configured anyway. - //But no configured AS is not something that should happen - lb_hash_put(sticky_ht, hash0, asindex0, - vnet_buffer (p0)->ip.adj_index[VLIB_TX], - available_index0, lb_time); - } - else - { - //Could not store new entry in the table - asindex0 = vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; - counter = LB_VIP_COUNTER_UNTRACKED_PACKET; - } - - vlib_increment_simple_counter(&lbm->vip_counters[counter], - thread_index, - vnet_buffer (p0)->ip.adj_index[VLIB_TX], - 1); - - //Now let's encap - if ( (encap_type == LB_ENCAP_TYPE_GRE4) - || (encap_type == LB_ENCAP_TYPE_GRE6) ) + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + while (n_left_from > 0 && n_left_to_next > 0) { - gre_header_t *gre0; - if (encap_type == LB_ENCAP_TYPE_GRE4) /* encap GRE4*/ - { - ip4_header_t *ip40; - vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t)); - ip40 = vlib_buffer_get_current(p0); - gre0 = (gre_header_t *)(ip40 + 1); - ip40->src_address = lbm->ip4_src_address; - ip40->dst_address = lbm->ass[asindex0].address.ip4; - ip40->ip_version_and_header_length = 0x45; - ip40->ttl = 128; - ip40->fragment_id = 0; - ip40->flags_and_fragment_offset = 0; - ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); - ip40->protocol = IP_PROTOCOL_GRE; - ip40->checksum = ip4_header_checksum (ip40); - } - else /* encap GRE6*/ - { - ip6_header_t *ip60; - vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t)); - ip60 = vlib_buffer_get_current(p0); - gre0 = (gre_header_t *)(ip60 + 1); - ip60->dst_address = lbm->ass[asindex0].address.ip6; - ip60->src_address = lbm->ip6_src_address; - 
ip60->hop_limit = 128; - ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28); - ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t)); - ip60->protocol = IP_PROTOCOL_GRE; - } - - gre0->flags_and_version = 0; - gre0->protocol = (is_input_v4)? - clib_host_to_net_u16(0x0800): - clib_host_to_net_u16(0x86DD); - } else if (encap_type == LB_ENCAP_TYPE_L3DSR) /* encap L3DSR*/ - { - ip4_header_t *ip40; - tcp_header_t *th0; - - ip40 = vlib_buffer_get_current(p0); - ip40->dst_address = lbm->ass[asindex0].address.ip4; - /* Get and rewrite DSCP bit */ - ip40->tos = (u8)((vip0->dscp & 0x3F)<<2); - ip40->checksum = ip4_header_checksum (ip40); - /* Recomputing L4 checksum after dst-IP modifying */ - th0 = ip4_next_header(ip40); - th0->checksum = 0; - th0->checksum = ip4_tcp_udp_compute_checksum(vm, p0, ip40); - } - - if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) - { - lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr)); - tr->as_index = asindex0; - tr->vip_index = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; - } - - //Enqueue to next - //Note that this is going to error if asindex0 == 0 - vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbm->ass[asindex0].dpo.dpoi_index; - vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, - n_left_to_next, pi0, - lbm->ass[asindex0].dpo.dpoi_next_node); + u32 pi0; + vlib_buffer_t *p0; + lb_vip_t *vip0; + u32 asindex0; + u16 len0; + u32 available_index0; + u8 counter = 0; + u32 hash0 = nexthash0; + + if (PREDICT_TRUE(n_left_from > 1)) + { + vlib_buffer_t *p1 = vlib_get_buffer (vm, from[1]); + //Compute next hash and prefetch bucket + nexthash0 = lb_node_get_hash (p1, is_input_v4); + lb_hash_prefetch_bucket (sticky_ht, nexthash0); + //Prefetch for encap, next + CLIB_PREFETCH(vlib_buffer_get_current (p1) - 64, 64, STORE); + } + + if (PREDICT_TRUE(n_left_from > 2)) + { + vlib_buffer_t *p2; + p2 = vlib_get_buffer (vm, from[2]); + /* prefetch packet header and data */ + 
vlib_prefetch_buffer_header(p2, STORE); + CLIB_PREFETCH(vlib_buffer_get_current (p2), 64, STORE); + } + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + vip0 = pool_elt_at_index(lbm->vips, + vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + + if (is_input_v4) + { + ip4_header_t *ip40; + ip40 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16 (ip40->length); + } + else + { + ip6_header_t *ip60; + ip60 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16 (ip60->payload_length) + + sizeof(ip6_header_t); + } + + lb_hash_get (sticky_ht, hash0, + vnet_buffer (p0)->ip.adj_index[VLIB_TX], lb_time, + &available_index0, &asindex0); + + if (PREDICT_TRUE(asindex0 != ~0)) + { + //Found an existing entry + counter = LB_VIP_COUNTER_NEXT_PACKET; + } + else if (PREDICT_TRUE(available_index0 != ~0)) + { + //There is an available slot for a new flow + asindex0 = + vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; + counter = LB_VIP_COUNTER_FIRST_PACKET; + counter = (asindex0 == 0) ? LB_VIP_COUNTER_NO_SERVER : counter; + + //TODO: There are race conditions with as0 and vip0 manipulation. + //Configuration may be changed, vectors resized, etc... + + //Dereference previously used + vlib_refcount_add ( + &lbm->as_refcount, thread_index, + lb_hash_available_value (sticky_ht, hash0, available_index0), + -1); + vlib_refcount_add (&lbm->as_refcount, thread_index, asindex0, 1); + + //Add sticky entry + //Note that when there is no AS configured, an entry is configured anyway. 
+ //But no configured AS is not something that should happen + lb_hash_put (sticky_ht, hash0, asindex0, + vnet_buffer (p0)->ip.adj_index[VLIB_TX], + available_index0, lb_time); + } + else + { + //Could not store new entry in the table + asindex0 = + vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; + counter = LB_VIP_COUNTER_UNTRACKED_PACKET; + } + + vlib_increment_simple_counter ( + &lbm->vip_counters[counter], thread_index, + vnet_buffer (p0)->ip.adj_index[VLIB_TX], + 1); + + //Now let's encap + if ((encap_type == LB_ENCAP_TYPE_GRE4) + || (encap_type == LB_ENCAP_TYPE_GRE6)) + { + gre_header_t *gre0; + if (encap_type == LB_ENCAP_TYPE_GRE4) /* encap GRE4*/ + { + ip4_header_t *ip40; + vlib_buffer_advance ( + p0, -sizeof(ip4_header_t) - sizeof(gre_header_t)); + ip40 = vlib_buffer_get_current (p0); + gre0 = (gre_header_t *) (ip40 + 1); + ip40->src_address = lbm->ip4_src_address; + ip40->dst_address = lbm->ass[asindex0].address.ip4; + ip40->ip_version_and_header_length = 0x45; + ip40->ttl = 128; + ip40->fragment_id = 0; + ip40->flags_and_fragment_offset = 0; + ip40->length = clib_host_to_net_u16 ( + len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); + ip40->protocol = IP_PROTOCOL_GRE; + ip40->checksum = ip4_header_checksum (ip40); + } + else /* encap GRE6*/ + { + ip6_header_t *ip60; + vlib_buffer_advance ( + p0, -sizeof(ip6_header_t) - sizeof(gre_header_t)); + ip60 = vlib_buffer_get_current (p0); + gre0 = (gre_header_t *) (ip60 + 1); + ip60->dst_address = lbm->ass[asindex0].address.ip6; + ip60->src_address = lbm->ip6_src_address; + ip60->hop_limit = 128; + ip60->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (0x6 << 28); + ip60->payload_length = clib_host_to_net_u16 ( + len0 + sizeof(gre_header_t)); + ip60->protocol = IP_PROTOCOL_GRE; + } + + gre0->flags_and_version = 0; + gre0->protocol = + (is_input_v4) ? 
+ clib_host_to_net_u16 (0x0800) : + clib_host_to_net_u16 (0x86DD); + } + else if (encap_type == LB_ENCAP_TYPE_L3DSR) /* encap L3DSR*/ + { + ip4_header_t *ip40; + tcp_header_t *th0; + ip_csum_t csum; + u32 old_dst; + u32 old_dscp; + + ip40 = vlib_buffer_get_current (p0); + old_dst = ip40->dst_address.as_u32; + old_dscp = ip40->tos; + ip40->dst_address = lbm->ass[asindex0].address.ip4; + /* Get and rewrite DSCP bit */ + ip40->tos = (u8) ((vip0->encap_args.dscp & 0x3F) << 2); + + csum = ip40->checksum; + csum = ip_csum_sub_even (csum, old_dst); + csum = ip_csum_sub_even (csum, old_dscp); + csum = ip_csum_add_even (csum, + lbm->ass[asindex0].address.ip4.as_u32); + csum = ip_csum_add_even (csum, ip40->tos); + ip40->checksum = ip_csum_fold (csum); + + /* Recomputing L4 checksum after dst-IP modifying */ + th0 = ip4_next_header (ip40); + th0->checksum = 0; + th0->checksum = ip4_tcp_udp_compute_checksum (vm, p0, ip40); + } + else if ((encap_type == LB_ENCAP_TYPE_NAT4) + || (encap_type == LB_ENCAP_TYPE_NAT6)) + { + ip_csum_t csum; + udp_header_t *uh; + + /* do NAT */ + if ((is_input_v4 == 1) && (encap_type == LB_ENCAP_TYPE_NAT4)) + { + /* NAT44 */ + ip4_header_t *ip40; + u32 old_dst; + ip40 = vlib_buffer_get_current (p0); + uh = (udp_header_t *) (ip40 + 1); + old_dst = ip40->dst_address.as_u32; + ip40->dst_address = lbm->ass[asindex0].address.ip4; + + csum = ip40->checksum; + csum = ip_csum_sub_even (csum, old_dst); + csum = ip_csum_add_even ( + csum, lbm->ass[asindex0].address.ip4.as_u32); + ip40->checksum = ip_csum_fold (csum); + + if ((ip40->protocol == IP_PROTOCOL_UDP) + || (uh->dst_port == vip0->encap_args.port)) + { + uh->dst_port = vip0->encap_args.target_port; + csum = uh->checksum; + csum = ip_csum_sub_even (csum, old_dst); + csum = ip_csum_add_even ( + csum, lbm->ass[asindex0].address.ip4.as_u32); + uh->checksum = ip_csum_fold (csum); + } + else + { + next_index = LB_NEXT_DROP; + } + } + else if ((is_input_v4 == 0) && (encap_type == LB_ENCAP_TYPE_NAT6)) + { + /* 
NAT66 */ + ip6_header_t *ip60; + ip6_address_t old_dst; + + ip60 = vlib_buffer_get_current (p0); + uh = (udp_header_t *) (ip60 + 1); + + old_dst.as_u64[0] = ip60->dst_address.as_u64[0]; + old_dst.as_u64[1] = ip60->dst_address.as_u64[1]; + ip60->dst_address.as_u64[0] = + lbm->ass[asindex0].address.ip6.as_u64[0]; + ip60->dst_address.as_u64[1] = + lbm->ass[asindex0].address.ip6.as_u64[1]; + + if (PREDICT_TRUE(ip60->protocol == IP_PROTOCOL_UDP)) + { + uh->dst_port = vip0->encap_args.target_port; + csum = uh->checksum; + csum = ip_csum_sub_even (csum, old_dst.as_u64[0]); + csum = ip_csum_sub_even (csum, old_dst.as_u64[1]); + csum = ip_csum_add_even ( + csum, lbm->ass[asindex0].address.ip6.as_u64[0]); + csum = ip_csum_add_even ( + csum, lbm->ass[asindex0].address.ip6.as_u64[1]); + uh->checksum = ip_csum_fold (csum); + } + else + { + next_index = LB_NEXT_DROP; + } + } + } + + if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) + { + lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof(*tr)); + tr->as_index = asindex0; + tr->vip_index = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + } + + //Enqueue to next + //Note that this is going to error if asindex0 == 0 + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = + lbm->ass[asindex0].dpo.dpoi_index; + vlib_validate_buffer_enqueue_x1( + vm, node, next_index, to_next, n_left_to_next, pi0, + lbm->ass[asindex0].dpo.dpoi_next_node); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); } - vlib_put_next_frame (vm, node, next_index, n_left_to_next); - } return frame->n_vectors; } +u8 * +format_nodeport_lb_trace (u8 * s, va_list * args) +{ + lb_main_t *lbm = &lb_main; + CLIB_UNUSED(vlib_main_t * vm) += va_arg (*args, vlib_main_t *); + CLIB_UNUSED(vlib_node_t * node) + = va_arg (*args, vlib_node_t *); + lb_nodeport_trace_t *t = va_arg (*args, lb_nodeport_trace_t *); + if (pool_is_free_index(lbm->vips, t->vip_index)) + { + s = format (s, "lb vip[%d]: This VIP was freed since capture\n"); + } + else + { + s = format (s, "lb vip[%d]: 
%U\n", t->vip_index, format_lb_vip, + &lbm->vips[t->vip_index]); + } + + s = format (s, " lb node_port: %d", t->node_port); + + return s; +} + static uword -lb6_gre6_node_fn (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame) +lb_nodeport_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, u8 is_input_v4) { - return lb_node_fn(vm, node, frame, 0, LB_ENCAP_TYPE_GRE6); + lb_main_t *lbm = &lb_main; + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0; + vlib_buffer_t *p0; + udp_header_t * udp_0; + uword * entry0; + + if (PREDICT_TRUE(n_left_from > 1)) + { + vlib_buffer_t *p1 = vlib_get_buffer (vm, from[1]); + //Prefetch for encap, next + CLIB_PREFETCH(vlib_buffer_get_current (p1) - 64, 64, STORE); + } + + if (PREDICT_TRUE(n_left_from > 2)) + { + vlib_buffer_t *p2; + p2 = vlib_get_buffer (vm, from[2]); + /* prefetch packet header and data */ + vlib_prefetch_buffer_header(p2, STORE); + CLIB_PREFETCH(vlib_buffer_get_current (p2), 64, STORE); + } + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + + if (is_input_v4) + { + ip4_header_t *ip40; + vlib_buffer_advance ( + p0, -(word) (sizeof(udp_header_t) + sizeof(ip4_header_t))); + ip40 = vlib_buffer_get_current (p0); + udp_0 = (udp_header_t *) (ip40 + 1); + } + else + { + ip6_header_t *ip60; + vlib_buffer_advance ( + p0, -(word) (sizeof(udp_header_t) + sizeof(ip6_header_t))); + ip60 = vlib_buffer_get_current (p0); + udp_0 = (udp_header_t *) (ip60 + 1); + } + + entry0 = hash_get_mem(lbm->vip_index_by_nodeport, &(udp_0->dst_port)); + + //Enqueue to next + vnet_buffer(p0)->ip.adj_index[VLIB_TX] = 
entry0[0]; + + if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) + { + lb_nodeport_trace_t *tr = vlib_add_trace (vm, node, p0, + sizeof(*tr)); + tr->vip_index = entry0[0]; + tr->node_port = (u32) clib_net_to_host_u16 (udp_0->dst_port); + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, pi0, + is_input_v4 ? + LB4_NODEPORT_NEXT_IP4_NAT4 : LB6_NODEPORT_NEXT_IP6_NAT6); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; + +} + +/** + * @brief Match NAT44 static mapping. + * + * @param lbm LB main; the lookup uses its mapping_by_as4 hash. + * @param match Key to look up (its as_u64 value is used as the hash key). + * @param index On a match, receives the value stored for the key. + * + * @returns 0 if match found, otherwise 1. + */ +int +lb_nat44_mapping_match (lb_main_t *lbm, lb_snat4_key_t * match, u32 *index) +{ + clib_bihash_kv_8_8_t kv4, value; + clib_bihash_8_8_t *mapping_hash = &lbm->mapping_by_as4; + + kv4.key = match->as_u64; + kv4.value = 0; + if (clib_bihash_search_8_8 (mapping_hash, &kv4, &value)) + { + return 1; + } + + *index = value.value; + return 0; +} + +/** + * @brief Match NAT66 static mapping. + * + * @param lbm LB main. + * @param match Address and port to match. + * @param index On a match, receives the value stored for the key. + * + * @returns 0 if match found otherwise 1. 
+ */ +int +lb_nat66_mapping_match (lb_main_t *lbm, lb_snat6_key_t * match, u32 *index) +{ + clib_bihash_kv_24_8_t kv6, value; + lb_snat6_key_t m_key6; + clib_bihash_24_8_t *mapping_hash = &lbm->mapping_by_as6; + + m_key6.addr.as_u64[0] = match->addr.as_u64[0]; + m_key6.addr.as_u64[1] = match->addr.as_u64[1]; + m_key6.port = match->port; + m_key6.protocol = 0; + m_key6.fib_index = 0; + + kv6.key[0] = m_key6.as_u64[0]; + kv6.key[1] = m_key6.as_u64[1]; + kv6.key[2] = m_key6.as_u64[2]; + kv6.value = 0; + if (clib_bihash_search_24_8 (mapping_hash, &kv6, &value)) + { + return 1; + } + + *index = value.value; + return 0; +} + +static uword +lb_nat_in2out_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame, u32 is_nat4) +{ + u32 n_left_from, *from, *to_next; + u32 next_index; + u32 pkts_processed = 0; + lb_main_t *lbm = &lb_main; + u32 stats_node_index; + + stats_node_index = + is_nat4 ? lb_nat4_in2out_node.index : lb_nat6_in2out_node.index; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + u32 sw_if_index0; + ip_csum_t csum; + u16 old_port0, new_port0; + udp_header_t * udp0; + tcp_header_t * tcp0; + + u32 proto0; + u32 rx_fib_index0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + next0 = LB_NAT4_IN2OUT_NEXT_LOOKUP; + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + rx_fib_index0 = ip4_fib_table_get_index_for_sw_if_index ( + sw_if_index0); + + if (is_nat4) + { + ip4_header_t * ip40; + u32 old_addr0, new_addr0; + lb_snat4_key_t key40; + lb_snat_mapping_t *sm40; + u32 index40; + + ip40 = 
vlib_buffer_get_current (b0); + udp0 = ip4_next_header (ip40); + tcp0 = (tcp_header_t *) udp0; + proto0 = lb_ip_proto_to_nat_proto (ip40->protocol); + + key40.addr = ip40->src_address; + key40.protocol = proto0; + key40.port = udp0->src_port; + key40.fib_index = rx_fib_index0; + + if (lb_nat44_mapping_match (lbm, &key40, &index40)) + { + next0 = LB_NAT4_IN2OUT_NEXT_DROP; + goto trace0; + } + + sm40 = pool_elt_at_index(lbm->snat_mappings, index40); + new_addr0 = sm40->src_ip.ip4.as_u32; + new_port0 = sm40->src_port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm40->fib_index; + old_addr0 = ip40->src_address.as_u32; + ip40->src_address.as_u32 = new_addr0; + + csum = ip40->checksum; + csum = ip_csum_sub_even (csum, old_addr0); + csum = ip_csum_add_even (csum, new_addr0); + ip40->checksum = ip_csum_fold (csum); + + if (PREDICT_TRUE(proto0 == LB_NAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->src_port; + tcp0->src_port = new_port0; + + csum = tcp0->checksum; + csum = ip_csum_sub_even (csum, old_addr0); + csum = ip_csum_sub_even (csum, old_port0); + csum = ip_csum_add_even (csum, new_addr0); + csum = ip_csum_add_even (csum, new_port0); + tcp0->checksum = ip_csum_fold (csum); + } + else if (PREDICT_TRUE(proto0 == LB_NAT_PROTOCOL_UDP)) + { + old_port0 = udp0->src_port; + udp0->src_port = new_port0; + + csum = udp0->checksum; + csum = ip_csum_sub_even (csum, old_addr0); + csum = ip_csum_sub_even (csum, old_port0); + csum = ip_csum_add_even (csum, new_addr0); + csum = ip_csum_add_even (csum, new_port0); + udp0->checksum = ip_csum_fold (csum); + } + + pkts_processed += next0 != LB_NAT4_IN2OUT_NEXT_DROP; + } + else + { + ip6_header_t * ip60; + ip6_address_t old_addr0, new_addr0; + lb_snat6_key_t key60; + lb_snat_mapping_t *sm60; + u32 index60; + + ip60 = vlib_buffer_get_current (b0); + udp0 = ip6_next_header (ip60); + tcp0 = (tcp_header_t *) udp0; + proto0 = lb_ip_proto_to_nat_proto (ip60->protocol); + + key60.addr.as_u64[0] = ip60->src_address.as_u64[0]; + key60.addr.as_u64[1] = 
ip60->src_address.as_u64[1]; + key60.protocol = proto0; + key60.port = udp0->src_port; + key60.fib_index = rx_fib_index0; + + if (lb_nat66_mapping_match (lbm, &key60, &index60)) + { + next0 = LB_NAT6_IN2OUT_NEXT_DROP; + goto trace0; + } + + sm60 = pool_elt_at_index(lbm->snat_mappings, index60); + new_addr0.as_u64[0] = sm60->src_ip.as_u64[0]; + new_addr0.as_u64[1] = sm60->src_ip.as_u64[1]; + new_port0 = sm60->src_port; + vnet_buffer(b0)->sw_if_index[VLIB_TX] = sm60->fib_index; + old_addr0.as_u64[0] = ip60->src_address.as_u64[0]; + old_addr0.as_u64[1] = ip60->src_address.as_u64[1]; + ip60->src_address.as_u64[0] = new_addr0.as_u64[0]; + ip60->src_address.as_u64[1] = new_addr0.as_u64[1]; + + if (PREDICT_TRUE(proto0 == LB_NAT_PROTOCOL_TCP)) + { + old_port0 = tcp0->src_port; + tcp0->src_port = new_port0; + + csum = tcp0->checksum; + csum = ip_csum_sub_even (csum, old_addr0.as_u64[0]); + csum = ip_csum_sub_even (csum, old_addr0.as_u64[1]); + csum = ip_csum_add_even (csum, new_addr0.as_u64[0]); + csum = ip_csum_add_even (csum, new_addr0.as_u64[1]); + csum = ip_csum_sub_even (csum, old_port0); + csum = ip_csum_add_even (csum, new_port0); + tcp0->checksum = ip_csum_fold (csum); + } + else if (PREDICT_TRUE(proto0 == LB_NAT_PROTOCOL_UDP)) + { + old_port0 = udp0->src_port; + udp0->src_port = new_port0; + + csum = udp0->checksum; + csum = ip_csum_sub_even (csum, old_addr0.as_u64[0]); + csum = ip_csum_sub_even (csum, old_addr0.as_u64[1]); + csum = ip_csum_add_even (csum, new_addr0.as_u64[0]); + csum = ip_csum_add_even (csum, new_addr0.as_u64[1]); + csum = ip_csum_sub_even (csum, old_port0); + csum = ip_csum_add_even (csum, new_port0); + udp0->checksum = ip_csum_fold (csum); + } + + pkts_processed += next0 != LB_NAT4_IN2OUT_NEXT_DROP; + } + + trace0: if (PREDICT_FALSE( + (node->flags & VLIB_NODE_FLAG_TRACE) && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + lb_nat_trace_t *t = vlib_add_trace (vm, node, b0, sizeof(*t)); + t->rx_sw_if_index = sw_if_index0; + t->next_index = next0; + } + 
+ /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, stats_node_index, + LB_NAT_IN2OUT_ERROR_IN2OUT_PACKETS, + pkts_processed); + return frame->n_vectors; } static uword -lb6_gre4_node_fn (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame) +lb6_gre6_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 0, LB_ENCAP_TYPE_GRE4); + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE6); } static uword -lb4_gre6_node_fn (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame) +lb6_gre4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_GRE6); + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE4); } static uword -lb4_gre4_node_fn (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame) +lb4_gre6_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_GRE4); + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE6); } static uword -lb4_l3dsr_node_fn (vlib_main_t * vm, - vlib_node_runtime_t * node, vlib_frame_t * frame) +lb4_gre4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - return lb_node_fn(vm, node, frame, 1, LB_ENCAP_TYPE_L3DSR); + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE4); } -VLIB_REGISTER_NODE (lb6_gre6_node) = +static uword +lb4_l3dsr_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - .function = lb6_gre6_node_fn, - .name = "lb6-gre6", - .vector_size = sizeof (u32), - .format_trace = format_lb_trace, + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_L3DSR); +} - .n_errors = 
LB_N_ERROR, - .error_strings = lb_error_strings, +static uword +lb6_nat6_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_NAT6); +} - .n_next_nodes = LB_N_NEXT, - .next_nodes = - { - [LB_NEXT_DROP] = "error-drop" - }, -}; +static uword +lb4_nat4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_NAT4); +} -VLIB_REGISTER_NODE (lb6_gre4_node) = +static uword +lb_nat4_in2out_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - .function = lb6_gre4_node_fn, - .name = "lb6-gre4", - .vector_size = sizeof (u32), - .format_trace = format_lb_trace, + return lb_nat_in2out_node_fn (vm, node, frame, 1); +} + +static uword +lb_nat6_in2out_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_nat_in2out_node_fn (vm, node, frame, 0); +} - .n_errors = LB_N_ERROR, - .error_strings = lb_error_strings, +VLIB_REGISTER_NODE (lb6_gre6_node) = + { + .function = lb6_gre6_node_fn, + .name = "lb6-gre6", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; - .n_next_nodes = LB_N_NEXT, - .next_nodes = +VLIB_REGISTER_NODE (lb6_gre4_node) = { - [LB_NEXT_DROP] = "error-drop" - }, -}; + .function = lb6_gre4_node_fn, + .name = "lb6-gre4", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; VLIB_REGISTER_NODE (lb4_gre6_node) = -{ - .function = lb4_gre6_node_fn, - .name = "lb4-gre6", - .vector_size = sizeof (u32), - .format_trace = format_lb_trace, + { + .function = lb4_gre6_node_fn, + .name = "lb4-gre6", + .vector_size = sizeof(u32), + 
.format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; + +VLIB_REGISTER_NODE (lb4_gre4_node) = + { + .function = lb4_gre4_node_fn, + .name = "lb4-gre4", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; - .n_errors = LB_N_ERROR, - .error_strings = lb_error_strings, +VLIB_REGISTER_NODE (lb4_l3dsr_node) = + { + .function = lb4_l3dsr_node_fn, + .name = "lb4-l3dsr", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; - .n_next_nodes = LB_N_NEXT, - .next_nodes = +VLIB_REGISTER_NODE (lb6_nat6_node) = { - [LB_NEXT_DROP] = "error-drop" - }, -}; + .function = lb6_nat6_node_fn, + .name = "lb6-nat6", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; -VLIB_REGISTER_NODE (lb4_gre4_node) = +VLIB_REGISTER_NODE (lb4_nat4_node) = + { + .function = lb4_nat4_node_fn, + .name = "lb4-nat4", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; + +static uword +lb4_nodeport_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - .function = lb4_gre4_node_fn, - .name = "lb4-gre4", - .vector_size = sizeof (u32), - .format_trace = format_lb_trace, + return lb_nodeport_node_fn (vm, node, frame, 1); +} - .n_errors = LB_N_ERROR, - .error_strings = lb_error_strings, +static uword 
+lb6_nodeport_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_nodeport_node_fn (vm, node, frame, 0); +} - .n_next_nodes = LB_N_NEXT, - .next_nodes = +VLIB_REGISTER_NODE (lb4_nodeport_node) = { - [LB_NEXT_DROP] = "error-drop" - }, -}; + .function = lb4_nodeport_node_fn, + .name = "lb4-nodeport", + .vector_size = sizeof(u32), + .format_trace = format_nodeport_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB4_NODEPORT_N_NEXT, + .next_nodes = + { + [LB4_NODEPORT_NEXT_IP4_NAT4] = "lb4-nat4", + [LB4_NODEPORT_NEXT_DROP] = "error-drop", + }, + }; -VLIB_REGISTER_NODE (lb4_l3dsr_node) = -{ - .function = lb4_l3dsr_node_fn, - .name = "lb4-l3dsr", - .vector_size = sizeof (u32), - .format_trace = format_lb_trace, +VLIB_REGISTER_NODE (lb6_nodeport_node) = + { + .function = lb6_nodeport_node_fn, + .name = "lb6-nodeport", + .vector_size = sizeof(u32), + .format_trace = format_nodeport_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB6_NODEPORT_N_NEXT, + .next_nodes = + { + [LB6_NODEPORT_NEXT_IP6_NAT6] = "lb6-nat6", + [LB6_NODEPORT_NEXT_DROP] = "error-drop", + }, + }; - .n_errors = LB_N_ERROR, - .error_strings = lb_error_strings, +VNET_FEATURE_INIT (lb_nat4_in2out_node_fn, static) = + { + .arc_name = "ip4-unicast", + .node_name = "lb-nat4-in2out", + .runs_before = VNET_FEATURES("ip4-lookup"), + }; + +VLIB_REGISTER_NODE (lb_nat4_in2out_node) = + { + .function = lb_nat4_in2out_node_fn, + .name = "lb-nat4-in2out", + .vector_size = sizeof(u32), + .format_trace = format_lb_nat_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_NAT4_IN2OUT_N_NEXT, + .next_nodes = + { + [LB_NAT4_IN2OUT_NEXT_DROP] = "error-drop", + [LB_NAT4_IN2OUT_NEXT_LOOKUP] = "ip4-lookup", + }, + }; - .n_next_nodes = LB_N_NEXT, - .next_nodes = +VNET_FEATURE_INIT (lb_nat6_in2out_node_fn, static) = { - [LB_NEXT_DROP] = "error-drop" - }, -}; + 
.arc_name = "ip6-unicast", + .node_name = "lb-nat6-in2out", + .runs_before = VNET_FEATURES("ip6-lookup"), + }; + +VLIB_REGISTER_NODE (lb_nat6_in2out_node) = + { + .function = lb_nat6_in2out_node_fn, + .name = "lb-nat6-in2out", + .vector_size = sizeof(u32), + .format_trace = format_lb_nat_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_NAT6_IN2OUT_N_NEXT, + .next_nodes = + { + [LB_NAT6_IN2OUT_NEXT_DROP] = "error-drop", + [LB_NAT6_IN2OUT_NEXT_LOOKUP] = "ip6-lookup", + }, + }; + -- cgit 1.2.3-korg