diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/plugins/lb/api.c | 16 | ||||
-rw-r--r-- | src/plugins/lb/cli.c | 91 | ||||
-rw-r--r-- | src/plugins/lb/lb.api | 13 | ||||
-rw-r--r-- | src/plugins/lb/lb.c | 454 | ||||
-rw-r--r-- | src/plugins/lb/lb.h | 114 | ||||
-rw-r--r-- | src/plugins/lb/lb_test.c | 2 | ||||
-rw-r--r-- | src/plugins/lb/node.c | 246 |
7 files changed, 735 insertions, 201 deletions
diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c index afaf61c459a..18aae43b11d 100644 --- a/src/plugins/lb/api.c +++ b/src/plugins/lb/api.c @@ -109,11 +109,19 @@ vl_api_lb_add_del_vip_t_handler int rv = 0; lb_vip_add_args_t args; + if((mp->protocol != IP_PROTOCOL_TCP) + && (mp->protocol != IP_PROTOCOL_UDP)) + { + mp->protocol = ~0; + mp->port = 0; + } + memcpy (&(args.prefix.ip6), mp->ip_prefix, sizeof(args.prefix.ip6)); if (mp->is_del) { u32 vip_index; - if (!(rv = lb_vip_find_index(&(args.prefix), mp->prefix_length, &vip_index))) + if (!(rv = lb_vip_find_index(&(args.prefix), mp->prefix_length, + mp->protocol, mp->port, &vip_index))) rv = lb_vip_del(vip_index); } else { u32 vip_index; @@ -147,9 +155,7 @@ vl_api_lb_add_del_vip_t_handler else if ((mp->encap == LB_ENCAP_TYPE_NAT4) ||(mp->encap == LB_ENCAP_TYPE_NAT6)) { args.encap_args.srv_type = mp->type; - args.encap_args.port = ntohs(mp->port); args.encap_args.target_port = ntohs(mp->target_port); - args.encap_args.node_port = ntohs(mp->node_port); } rv = lb_vip_add(args, &vip_index); @@ -182,7 +188,6 @@ static void *vl_api_lb_add_del_vip_t_print s = format (s, "type %u ", mp->type); s = format (s, "port %u ", mp->port); s = format (s, "target_port %u ", mp->target_port); - s = format (s, "node_port %u ", mp->node_port); } s = format (s, "%u ", mp->new_flows_table_length); @@ -208,7 +213,8 @@ vl_api_lb_add_del_as_t_handler memcpy(&as_address.ip6, mp->as_address, sizeof(as_address.ip6)); - if ((rv = lb_vip_find_index(&vip_ip_prefix, mp->vip_prefix_length, &vip_index))) + if ((rv = lb_vip_find_index(&vip_ip_prefix, mp->vip_prefix_length, + mp->protocol, mp->port, &vip_index))) goto done; if (mp->is_del) diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c index 76f05a586de..cfe8f381b13 100644 --- a/src/plugins/lb/cli.c +++ b/src/plugins/lb/cli.c @@ -24,12 +24,11 @@ lb_vip_command_fn (vlib_main_t * vm, lb_vip_add_args_t args; u8 del = 0; int ret; + u32 port = 0; u32 encap = 0; u32 dscp = ~0; u32 srv_type 
= LB_SRV_TYPE_CLUSTERIP; - u32 port = 0; u32 target_port = 0; - u32 node_port = 0; clib_error_t *error = 0; args.new_length = 1024; @@ -50,6 +49,16 @@ lb_vip_command_fn (vlib_main_t * vm, ; else if (unformat(line_input, "del")) del = 1; + else if (unformat(line_input, "protocol tcp")) + { + args.protocol = (u8)IP_PROTOCOL_TCP; + } + else if (unformat(line_input, "protocol udp")) + { + args.protocol = (u8)IP_PROTOCOL_UDP; + } + else if (unformat(line_input, "port %d", &port)) + ; else if (unformat(line_input, "encap gre4")) encap = LB_ENCAP_TYPE_GRE4; else if (unformat(line_input, "encap gre6")) @@ -66,12 +75,8 @@ lb_vip_command_fn (vlib_main_t * vm, srv_type = LB_SRV_TYPE_CLUSTERIP; else if (unformat(line_input, "type nodeport")) srv_type = LB_SRV_TYPE_NODEPORT; - else if (unformat(line_input, "port %d", &port)) - ; else if (unformat(line_input, "target_port %d", &target_port)) ; - else if (unformat(line_input, "node_port %d", &node_port)) - ; else { error = clib_error_return (0, "parse error: '%U'", format_unformat_error, line_input); @@ -79,6 +84,17 @@ lb_vip_command_fn (vlib_main_t * vm, } } + /* if port == 0, it means all-port VIP */ + if (port == 0) + { + args.protocol = ~0; + args.port = 0; + } + else + { + args.port = (u16)port; + } + if ((encap != LB_ENCAP_TYPE_L3DSR) && (dscp != ~0)) { error = clib_error_return(0, "lb_vip_add error: " @@ -135,9 +151,7 @@ lb_vip_command_fn (vlib_main_t * vm, || (encap == LB_ENCAP_TYPE_NAT6)) { args.encap_args.srv_type = (u8) srv_type; - args.encap_args.port = (u16) port; args.encap_args.target_port = (u16) target_port; - args.encap_args.node_port = (u16) node_port; } if ((ret = lb_vip_add(args, &index))) { @@ -147,7 +161,8 @@ lb_vip_command_fn (vlib_main_t * vm, vlib_cli_output(vm, "lb_vip_add ok %d", index); } } else { - if ((ret = lb_vip_find_index(&(args.prefix), args.plen, &index))) { + if ((ret = lb_vip_find_index(&(args.prefix), args.plen, + args.protocol, args.port, &index))) { error = clib_error_return (0, 
"lb_vip_find_index error %d", ret); goto done; } else if ((ret = lb_vip_del(index))) { @@ -165,9 +180,11 @@ done: VLIB_CLI_COMMAND (lb_vip_command, static) = { .path = "lb vip", - .short_help = "lb vip <prefix> [encap (gre6|gre4|l3dsr|nat4|nat6)] " + .short_help = "lb vip <prefix> " + "[protocol (tcp|udp) port <n>] " + "[encap (gre6|gre4|l3dsr|nat4|nat6)] " "[dscp <n>] " - "[type (nodeport|clusterip) port <n> target_port <n> node_port <n>] " + "[type (nodeport|clusterip) target_port <n>] " "[new_len <n>] [del]", .function = lb_vip_command_fn, }; @@ -181,6 +198,8 @@ lb_as_command_fn (vlib_main_t * vm, u8 vip_plen; ip46_address_t *as_array = 0; u32 vip_index; + u32 port = 0; + u8 protocol = 0; u8 del = 0; int ret; clib_error_t *error = 0; @@ -188,30 +207,52 @@ lb_as_command_fn (vlib_main_t * vm, if (!unformat_user (input, unformat_line_input, line_input)) return 0; - if (!unformat(line_input, "%U", unformat_ip46_prefix, &vip_prefix, &vip_plen, IP46_TYPE_ANY)) { + if (!unformat(line_input, "%U", unformat_ip46_prefix, + &vip_prefix, &vip_plen, IP46_TYPE_ANY)) + { error = clib_error_return (0, "invalid as address: '%U'", format_unformat_error, line_input); goto done; } - if ((ret = lb_vip_find_index(&vip_prefix, vip_plen, &vip_index))) { - error = clib_error_return (0, "lb_vip_find_index error %d", ret); - goto done; - } - while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) { - if (unformat(line_input, "%U", unformat_ip46_address, &as_addr, IP46_TYPE_ANY)) { + if (unformat(line_input, "%U", unformat_ip46_address, + &as_addr, IP46_TYPE_ANY)) + { vec_add1(as_array, as_addr); } else if (unformat(line_input, "del")) { del = 1; - } else { + } + else if (unformat(line_input, "protocol tcp")) + { + protocol = (u8)IP_PROTOCOL_TCP; + } + else if (unformat(line_input, "protocol udp")) + { + protocol = (u8)IP_PROTOCOL_UDP; + } + else if (unformat(line_input, "port %d", &port)) + ; + else { error = clib_error_return (0, "parse error: '%U'", format_unformat_error, 
line_input); goto done; } } + /* If port == 0, it means all-port VIP */ + if (port == 0) + { + protocol = ~0; + } + + if ((ret = lb_vip_find_index(&vip_prefix, vip_plen, protocol, + (u16)port, &vip_index))){ + error = clib_error_return (0, "lb_vip_find_index error %d", ret); + goto done; + } + if (!vec_len(as_array)) { error = clib_error_return (0, "No AS address provided"); goto done; @@ -221,12 +262,14 @@ lb_as_command_fn (vlib_main_t * vm, clib_warning("vip index is %d", vip_index); if (del) { - if ((ret = lb_vip_del_ass(vip_index, as_array, vec_len(as_array)))) { + if ((ret = lb_vip_del_ass(vip_index, as_array, vec_len(as_array)))) + { error = clib_error_return (0, "lb_vip_del_ass error %d", ret); goto done; } } else { - if ((ret = lb_vip_add_ass(vip_index, as_array, vec_len(as_array)))) { + if ((ret = lb_vip_add_ass(vip_index, as_array, vec_len(as_array)))) + { error = clib_error_return (0, "lb_vip_add_ass error %d", ret); goto done; } @@ -242,7 +285,8 @@ done: VLIB_CLI_COMMAND (lb_as_command, static) = { .path = "lb as", - .short_help = "lb as <vip-prefix> [<address> [<address> [...]]] [del]", + .short_help = "lb as <vip-prefix> [protocol (tcp|udp) port <n>]" + " [<address> [<address> [...]]] [del]", .function = lb_as_command_fn, }; @@ -335,8 +379,11 @@ lb_show_vips_command_fn (vlib_main_t * vm, if (unformat(&line_input, "verbose")) verbose = 1; + /* Hide dummy VIP */ pool_foreach(vip, lbm->vips, { + if (vip != lbm->vips) { vlib_cli_output(vm, "%U\n", verbose?format_lb_vip_detailed:format_lb_vip, vip); + } }); unformat_free (&line_input); diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api index a9f05f253c5..de3d0c1e5e7 100644 --- a/src/plugins/lb/lb.api +++ b/src/plugins/lb/lb.api @@ -25,10 +25,11 @@ autoreply define lb_conf @param context - sender context, to match reply w/ request @param ip_prefix - IP address (IPv4 in lower order 32 bits). @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param protocol - tcp or udp. 
+ @param port - destination port. @param encap - Encap is ip4 GRE(0) or ip6 GRE(1) or L3DSR(2) or NAT4(3) or NAT6(4). @param dscp - DSCP bit corresponding to VIP(applicable in L3DSR mode only). @param type - service type(applicable in NAT4/NAT6 mode only). - @param port - service port(applicable in NAT4/NAT6 mode only). @param target_port - Pod's port corresponding to specific service(applicable in NAT4/NAT6 mode only). @param node_port - Node's port(applicable in NAT4/NAT6 mode only). @param new_flows_table_length - Size of the new connections flow table used @@ -40,10 +41,11 @@ autoreply define lb_add_del_vip { u32 context; u8 ip_prefix[16]; u8 prefix_length; + u8 protocol; + u16 port; u8 encap; u8 dscp; u8 type; - u16 port; u16 target_port; u16 node_port; u32 new_flows_table_length; @@ -54,7 +56,9 @@ autoreply define lb_add_del_vip { @param client_index - opaque cookie to identify the sender @param context - sender context, to match reply w/ request @param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits). - @param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param vip_prefix_length - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param protocol - tcp or udp. + @param port - destination port. @param as_address - The application server address (IPv4 in lower order 32 bits). @param is_del - The AS should be removed. 
*/ @@ -63,6 +67,9 @@ autoreply define lb_add_del_as { u32 context; u8 vip_ip_prefix[16]; u8 vip_prefix_length; + u8 protocol; + u16 port; u8 as_address[16]; u8 is_del; }; + diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c index d5dc3054fb4..fb62c217e7f 100644 --- a/src/plugins/lb/lb.c +++ b/src/plugins/lb/lb.c @@ -49,22 +49,44 @@ const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, }; -const static char * const lb_dpo_l3dsr_ip4[] = { "lb4-l3dsr" , NULL }; +const static char * const lb_dpo_gre4_ip4_port[] = { "lb4-gre4-port" , NULL }; +const static char * const lb_dpo_gre4_ip6_port[] = { "lb6-gre4-port" , NULL }; +const static char* const * const lb_dpo_gre4_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre4_ip4_port, + [DPO_PROTO_IP6] = lb_dpo_gre4_ip6_port, + }; + +const static char * const lb_dpo_gre6_ip4_port[] = { "lb4-gre6-port" , NULL }; +const static char * const lb_dpo_gre6_ip6_port[] = { "lb6-gre6-port" , NULL }; +const static char* const * const lb_dpo_gre6_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre6_ip4_port, + [DPO_PROTO_IP6] = lb_dpo_gre6_ip6_port, + }; + +const static char * const lb_dpo_l3dsr_ip4[] = {"lb4-l3dsr" , NULL}; const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4, }; -const static char * const lb_dpo_nat4_ip4[] = { "lb4-nat4" , NULL }; -const static char* const * const lb_dpo_nat4_nodes[DPO_PROTO_NUM] = +const static char * const lb_dpo_l3dsr_ip4_port[] = {"lb4-l3dsr-port" , NULL}; +const static char* const * const lb_dpo_l3dsr_port_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4_port, + }; + +const static char * const lb_dpo_nat4_ip4_port[] = { "lb4-nat4-port" , NULL }; +const static char* const * const lb_dpo_nat4_port_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP4] = lb_dpo_nat4_ip4, + [DPO_PROTO_IP4] = lb_dpo_nat4_ip4_port, }; -const static char * const lb_dpo_nat6_ip6[] = { "lb6-nat6" 
, NULL }; -const static char* const * const lb_dpo_nat6_nodes[DPO_PROTO_NUM] = +const static char * const lb_dpo_nat6_ip6_port[] = { "lb6-nat6-port" , NULL }; +const static char* const * const lb_dpo_nat6_port_nodes[DPO_PROTO_NUM] = { - [DPO_PROTO_IP6] = lb_dpo_nat6_ip6, + [DPO_PROTO_IP6] = lb_dpo_nat6_ip6_port, }; u32 lb_hash_time_now(vlib_main_t * vm) @@ -137,6 +159,11 @@ u8 *format_lb_vip (u8 * s, va_list * args) pool_elts(vip->as_indexes), (vip->flags & LB_VIP_FLAGS_USED)?"":" removed"); + if (vip->port != 0) + { + s = format(s, " protocol:%u port:%u ", vip->protocol, vip->port); + } + if (vip->type == LB_VIP_TYPE_IP4_L3DSR) { s = format(s, " dscp:%u", vip->encap_args.dscp); @@ -144,14 +171,10 @@ u8 *format_lb_vip (u8 * s, va_list * args) else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) || (vip->type == LB_VIP_TYPE_IP6_NAT6)) { - if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) - s = format (s, " type:clusterip port:%u target_port:%u", - ntohs (vip->encap_args.port), - ntohs (vip->encap_args.target_port)); - else - s = format (s, " type:nodeport node_port:%u target_port:%u", - ntohs (vip->encap_args.node_port), - ntohs (vip->encap_args.target_port)); + s = format (s, " type:%s port:%u target_port:%u", + (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip": + "nodeport", + ntohs(vip->port), ntohs(vip->encap_args.target_port)); } return s; @@ -181,6 +204,13 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) format_white_space, indent, vip->new_flow_table_mask + 1); + if (vip->port != 0) + { + s = format(s, "%U protocol:%u port:%u\n", + format_white_space, indent, + vip->protocol, vip->port); + } + if (vip->type == LB_VIP_TYPE_IP4_L3DSR) { s = format(s, "%U dscp:%u\n", @@ -190,15 +220,11 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) || (vip->type == LB_VIP_TYPE_IP6_NAT6)) { - if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) - s = format (s, "%U type:clusterip port:%u target_port:%u", - 
format_white_space, indent, ntohs (vip->encap_args.port), - ntohs (vip->encap_args.target_port)); - else - s = format (s, "%U type:nodeport node_port:%u target_port:%u", - format_white_space, indent, - ntohs (vip->encap_args.node_port), - ntohs (vip->encap_args.target_port)); + s = format (s, "%U type:%s port:%u target_port:%u", + format_white_space, indent, + (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip": + "nodeport", + ntohs(vip->port), ntohs(vip->encap_args.target_port)); } //Print counters @@ -237,14 +263,6 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) }); vec_free(count); - - /* - s = format(s, "%U new flows table:\n", format_white_space, indent); - lb_new_flow_entry_t *nfe; - vec_foreach(nfe, vip->new_flow_table) { - s = format(s, "%U %d: %d\n", format_white_space, indent, nfe - vip->new_flow_table, nfe->as_index); - } - */ return s; } @@ -283,11 +301,11 @@ static void lb_vip_garbage_collection(lb_vip_t *vip) pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; if (!(as->flags & LB_AS_FLAGS_USED) && //Not used - clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used + clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) { //Not referenced - if (lb_vip_is_nat4(vip)) { + if (lb_vip_is_nat4_port(vip)) { m_key4.addr = as->address.ip4; m_key4.port = vip->encap_args.target_port; m_key4.protocol = 0; @@ -300,8 +318,8 @@ static void lb_vip_garbage_collection(lb_vip_t *vip) kv4.value = m - lbm->snat_mappings; clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 0); - pool_put (lbm->snat_mappings, m); - } else if (lb_vip_is_nat6(vip)) { + pool_put (lbm->snat_mappings, m); + } else if (lb_vip_is_nat6_port(vip)) { m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0]; m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1]; m_key6.port = vip->encap_args.target_port; @@ -478,8 +496,13 @@ int lb_conf(ip4_address_t *ip4_address, 
ip6_address_t *ip6_address, return 0; } + + static -int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index) +int lb_vip_port_find_index(ip46_address_t *prefix, u8 plen, + u8 protocol, u16 port, + lb_lkp_type_t lkp_type, + u32 *vip_index) { lb_main_t *lbm = &lb_main; lb_vip_t *vip; @@ -489,19 +512,57 @@ int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index) if ((vip->flags & LB_AS_FLAGS_USED) && vip->plen == plen && vip->prefix.as_u64[0] == prefix->as_u64[0] && - vip->prefix.as_u64[1] == prefix->as_u64[1]) { - *vip_index = vip - lbm->vips; - return 0; - } + vip->prefix.as_u64[1] == prefix->as_u64[1]) + { + if((lkp_type == LB_LKP_SAME_IP_PORT && + vip->protocol == protocol && + vip->port == port) || + (lkp_type == LB_LKP_ALL_PORT_IP && + vip->port == 0) || + (lkp_type == LB_LKP_DIFF_IP_PORT && + (vip->protocol != protocol || + vip->port != port) ) ) + { + *vip_index = vip - lbm->vips; + return 0; + } + } }); return VNET_API_ERROR_NO_SUCH_ENTRY; } -int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index) +static +int lb_vip_port_find_index_with_lock(ip46_address_t *prefix, u8 plen, + u8 protocol, u16 port, u32 *vip_index) +{ + return lb_vip_port_find_index(prefix, plen, protocol, port, + LB_LKP_SAME_IP_PORT, vip_index); +} + +static +int lb_vip_port_find_all_port_vip(ip46_address_t *prefix, u8 plen, + u32 *vip_index) +{ + return lb_vip_port_find_index(prefix, plen, ~0, 0, + LB_LKP_ALL_PORT_IP, vip_index); +} + +/* Find out per-port-vip entry with different protocol and port */ +static +int lb_vip_port_find_diff_port(ip46_address_t *prefix, u8 plen, + u8 protocol, u16 port, u32 *vip_index) +{ + return lb_vip_port_find_index(prefix, plen, protocol, port, + LB_LKP_DIFF_IP_PORT, vip_index); +} + +int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u8 protocol, + u16 port, u32 *vip_index) { int ret; lb_get_writer_lock(); - ret = lb_vip_find_index_with_lock(prefix, plen, vip_index); + ret = 
lb_vip_port_find_index_with_lock(prefix, plen, + protocol, port, vip_index); lb_put_writer_lock(); return ret; } @@ -516,7 +577,8 @@ static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_ as = &lbm->ass[*asi]; if (as->vip_index == (vip - lbm->vips) && as->address.as_u64[0] == address->as_u64[0] && - as->address.as_u64[1] == address->as_u64[1]) { + as->address.as_u64[1] == address->as_u64[1]) + { *as_index = as - lbm->ass; return 0; } @@ -609,23 +671,23 @@ next: } as->next_hop_fib_entry_index = - fib_table_entry_special_add(0, + fib_table_entry_special_add(0, &nh, FIB_SOURCE_RR, FIB_ENTRY_FLAG_NONE); as->next_hop_child_index = - fib_entry_child_add(as->next_hop_fib_entry_index, + fib_entry_child_add(as->next_hop_fib_entry_index, lbm->fib_node_type, as - lbm->ass); lb_as_stack(as); - if ( lb_vip_is_nat4(vip) || lb_vip_is_nat6(vip) ) + if ( lb_vip_is_nat4_port(vip) || lb_vip_is_nat6_port(vip) ) { /* Add SNAT static mapping */ pool_get (lbm->snat_mappings, m); memset (m, 0, sizeof (*m)); - if (lb_vip_is_nat4(vip)) { + if (lb_vip_is_nat4_port(vip)) { lb_snat4_key_t m_key4; clib_bihash_kv_8_8_t kv4; m_key4.addr = as->address.ip4; @@ -636,16 +698,15 @@ next: if (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP) { m->src_ip.ip4 = vip->prefix.ip4; - m->src_port = vip->encap_args.port; } else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT) { m->src_ip.ip4 = lbm->ip4_src_address; - m->src_port = vip->encap_args.node_port; } m->src_ip_is_ipv6 = 0; m->as_ip.ip4 = as->address.ip4; - m->as_ip_is_ipv6 = 0;; + m->as_ip_is_ipv6 = 0; + m->src_port = vip->port; m->target_port = vip->encap_args.target_port; m->vrf_id = 0; m->fib_index = 0; @@ -666,18 +727,17 @@ next: { m->src_ip.ip6.as_u64[0] = vip->prefix.ip6.as_u64[0]; m->src_ip.ip6.as_u64[1] = vip->prefix.ip6.as_u64[1]; - m->src_port = vip->encap_args.port; } else if (vip->encap_args.srv_type == LB_SRV_TYPE_NODEPORT) { m->src_ip.ip6.as_u64[0] = lbm->ip6_src_address.as_u64[0]; m->src_ip.ip6.as_u64[1] 
= lbm->ip6_src_address.as_u64[1]; - m->src_port = vip->encap_args.node_port; } m->src_ip_is_ipv6 = 1; m->as_ip.ip6.as_u64[0] = as->address.ip6.as_u64[0]; m->as_ip.ip6.as_u64[1] = as->address.ip6.as_u64[1]; m->as_ip_is_ipv6 = 1; + m->src_port = vip->port; m->target_port = vip->encap_args.target_port; m->vrf_id = 0; m->fib_index = 0; @@ -707,6 +767,7 @@ int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) lb_main_t *lbm = &lb_main; u32 now = (u32) vlib_time_now(vlib_get_main()); u32 *ip = 0; + u32 as_index = 0; lb_vip_t *vip; if (!(vip = lb_vip_get_by_index(vip_index))) { @@ -715,8 +776,7 @@ int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) u32 *indexes = NULL; while (n--) { - u32 i; - if (lb_as_find_index_vip(vip, &addresses[n], &i)) { + if (lb_as_find_index_vip(vip, &addresses[n], &as_index)) { vec_free(indexes); return VNET_API_ERROR_NO_SUCH_ENTRY; } @@ -730,7 +790,7 @@ int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) } } - vec_add1(indexes, i); + vec_add1(indexes, as_index); next: continue; } @@ -757,20 +817,71 @@ int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n) lb_get_writer_lock(); int ret = lb_vip_del_ass_withlock(vip_index, addresses, n); lb_put_writer_lock(); + return ret; } +static int +lb_vip_prefix_index_alloc (lb_main_t *lbm) +{ + /* + * Check for dynamically allocated instance number.
+ */ + u32 bit; + + bit = clib_bitmap_first_clear (lbm->vip_prefix_indexes); + + lbm->vip_prefix_indexes = clib_bitmap_set(lbm->vip_prefix_indexes, bit, 1); + + return bit; +} + +static int +lb_vip_prefix_index_free (lb_main_t *lbm, u32 instance) +{ + + if (clib_bitmap_get (lbm->vip_prefix_indexes, instance) == 0) + { + return -1; + } + + lbm->vip_prefix_indexes = clib_bitmap_set (lbm->vip_prefix_indexes, + instance, 0); + + return 0; +} + /** * Add the VIP adjacency to the ip4 or ip6 fib */ -static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) +static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip, + u32 *vip_prefix_index) { dpo_proto_t proto = 0; dpo_type_t dpo_type = 0; + u32 vip_idx = 0; + + if (vip->port != 0) + { + /* for per-port vip, if VIP adjacency has been added, + * no need to add adjacency. */ + if (!lb_vip_port_find_diff_port(&(vip->prefix), vip->plen, + vip->protocol, vip->port, &vip_idx)) + { + return; + } + + /* Allocate an index for per-port vip */ + *vip_prefix_index = lb_vip_prefix_index_alloc(lbm); + } + else + { + *vip_prefix_index = vip - lbm->vips; + } dpo_id_t dpo = DPO_INVALID; fib_prefix_t pfx = {}; - if (lb_vip_is_ip4(vip)) { + if (lb_vip_is_ip4(vip->type)) { pfx.fp_addr.ip4 = vip->prefix.ip4; pfx.fp_len = vip->plen - 96; pfx.fp_proto = FIB_PROTOCOL_IP4; @@ -786,14 +897,20 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) dpo_type = lbm->dpo_gre4_type; else if (lb_vip_is_gre6(vip)) dpo_type = lbm->dpo_gre6_type; + else if (lb_vip_is_gre4_port(vip)) + dpo_type = lbm->dpo_gre4_port_type; + else if (lb_vip_is_gre6_port(vip)) + dpo_type = lbm->dpo_gre6_port_type; else if (lb_vip_is_l3dsr(vip)) dpo_type = lbm->dpo_l3dsr_type; - else if(lb_vip_is_nat4(vip)) - dpo_type = lbm->dpo_nat4_type; - else if (lb_vip_is_nat6(vip)) - dpo_type = lbm->dpo_nat6_type; - - dpo_set(&dpo, dpo_type, proto, vip - lbm->vips); + else if (lb_vip_is_l3dsr_port(vip)) + dpo_type = lbm->dpo_l3dsr_port_type; + else 
if(lb_vip_is_nat4_port(vip)) + dpo_type = lbm->dpo_nat4_port_type; + else if (lb_vip_is_nat6_port(vip)) + dpo_type = lbm->dpo_nat6_port_type; + + dpo_set(&dpo, dpo_type, proto, *vip_prefix_index); fib_table_entry_special_dpo_add(0, &pfx, FIB_SOURCE_PLUGIN_HI, @@ -803,12 +920,75 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) } /** + * Add the VIP filter entry + */ +static int lb_vip_add_port_filter(lb_main_t *lbm, lb_vip_t *vip, + u32 vip_prefix_index, u32 vip_idx) +{ + vip_port_key_t key; + clib_bihash_kv_8_8_t kv; + + key.vip_prefix_index = vip_prefix_index; + key.protocol = vip->protocol; + key.port = clib_host_to_net_u16(vip->port); + key.rsv = 0; + + kv.key = key.as_u64; + kv.value = vip_idx; + clib_bihash_add_del_8_8(&lbm->vip_index_per_port, &kv, 1); + + return 0; +} + +/** + * Del the VIP filter entry + */ +static int lb_vip_del_port_filter(lb_main_t *lbm, lb_vip_t *vip) +{ + vip_port_key_t key; + clib_bihash_kv_8_8_t kv, value; + lb_vip_t *m = 0; + + key.vip_prefix_index = vip->vip_prefix_index; + key.protocol = vip->protocol; + key.port = clib_host_to_net_u16(vip->port); + + kv.key = key.as_u64; + if(clib_bihash_search_8_8(&lbm->vip_index_per_port, &kv, &value) == 0) + m = pool_elt_at_index (lbm->vips, value.value); + ASSERT (m); + + kv.value = m - lbm->vips; + clib_bihash_add_del_8_8(&lbm->vip_index_per_port, &kv, 0); + + return 0; +} + +/** * Deletes the adjacency associated with the VIP */ static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip) { fib_prefix_t pfx = {}; - if (lb_vip_is_ip4(vip)) { + u32 vip_idx = 0; + + if (vip->port != 0) + { + /* If this vip adjacency is used by other per-port vip, + * no need to del this adjacency. 
*/ + if (!lb_vip_port_find_diff_port(&(vip->prefix), vip->plen, + vip->protocol, vip->port, &vip_idx)) + { + lb_put_writer_lock(); + return; + } + + /* Return vip_prefix_index for per-port vip */ + lb_vip_prefix_index_free(lbm, vip->vip_prefix_index); + + } + + if (lb_vip_is_ip4(vip->type)) { pfx.fp_addr.ip4 = vip->prefix.ip4; pfx.fp_len = vip->plen - 96; pfx.fp_proto = FIB_PROTOCOL_IP4; @@ -826,15 +1006,47 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) vlib_main_t *vm = vlib_get_main(); lb_vip_t *vip; lb_vip_type_t type = args.type; - u16 node_port = args.encap_args.node_port; + u32 vip_prefix_index = 0; lb_get_writer_lock(); ip46_prefix_normalize(&(args.prefix), args.plen); - if (!lb_vip_find_index_with_lock(&(args.prefix), args.plen, vip_index)) { - lb_put_writer_lock(); - return VNET_API_ERROR_VALUE_EXIST; - } + if (!lb_vip_port_find_index_with_lock(&(args.prefix), args.plen, + args.protocol, args.port, + vip_index)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + /* Make sure we can't add a per-port VIP entry + * when there already is an all-port VIP for the same prefix. */ + if ((args.port != 0) && + !lb_vip_port_find_all_port_vip(&(args.prefix), args.plen, vip_index)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + /* Make sure we can't add an all-port VIP entry + * when there already is a per-port VIP for the same prefix. */ + if ((args.port == 0) && + !lb_vip_port_find_diff_port(&(args.prefix), args.plen, + args.protocol, args.port, vip_index)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + /* Make sure all VIPs for a given prefix (using different ports) have the same type.
*/ + if ((args.port != 0) && + !lb_vip_port_find_diff_port(&(args.prefix), args.plen, + args.protocol, args.port, vip_index) + && (args.type != lbm->vips[*vip_index].type)) + { + lb_put_writer_lock(); + return VNET_API_ERROR_INVALID_ARGUMENT; + } if (!is_pow2(args.new_length)) { lb_put_writer_lock(); @@ -842,23 +1054,19 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) } if (ip46_prefix_is_ip4(&(args.prefix), args.plen) && - (type != LB_VIP_TYPE_IP4_GRE4) && - (type != LB_VIP_TYPE_IP4_GRE6) && - (type != LB_VIP_TYPE_IP4_L3DSR) && - (type != LB_VIP_TYPE_IP4_NAT4)) { + !lb_vip_is_ip4(type)) { lb_put_writer_lock(); return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; } if ((!ip46_prefix_is_ip4(&(args.prefix), args.plen)) && - (type != LB_VIP_TYPE_IP6_GRE4) && - (type != LB_VIP_TYPE_IP6_GRE6) && - (type != LB_VIP_TYPE_IP6_NAT6)) { + !lb_vip_is_ip6(type)) { lb_put_writer_lock(); return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; } - if ((type == LB_VIP_TYPE_IP4_L3DSR) && (args.encap_args.dscp >= 64 ) ) + if ((type == LB_VIP_TYPE_IP4_L3DSR) && + (args.encap_args.dscp >= 64) ) { lb_put_writer_lock(); return VNET_API_ERROR_VALUE_EXIST; @@ -870,6 +1078,16 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) //Init memcpy (&(vip->prefix), &(args.prefix), sizeof(args.prefix)); vip->plen = args.plen; + if (args.port != 0) + { + vip->protocol = args.protocol; + vip->port = args.port; + } + else + { + vip->protocol = (u8)~0; + vip->port = 0; + } vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main()); vip->type = args.type; @@ -877,12 +1095,10 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) vip->encap_args.dscp = args.encap_args.dscp; } else if ((args.type == LB_VIP_TYPE_IP4_NAT4) - ||(args.type == LB_VIP_TYPE_IP6_NAT6)) { + ||(args.type == LB_VIP_TYPE_IP6_NAT6)) { vip->encap_args.srv_type = args.encap_args.srv_type; - vip->encap_args.port = clib_host_to_net_u16(args.encap_args.port); vip->encap_args.target_port = 
clib_host_to_net_u16(args.encap_args.target_port); - vip->encap_args.node_port = clib_host_to_net_u16(node_port); } vip->flags = LB_VIP_FLAGS_USED; @@ -899,20 +1115,20 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) vip->new_flow_table_mask = args.new_length - 1; vip->new_flow_table = 0; - //Create a new flow hash table full of the default entry + //Update flow hash table lb_vip_update_new_flow_table(vip); //Create adjacency to direct traffic - lb_vip_add_adjacency(lbm, vip); + lb_vip_add_adjacency(lbm, vip, &vip_prefix_index); - if ( (lb_vip_is_nat4(vip) || lb_vip_is_nat6(vip)) + if ( (lb_vip_is_nat4_port(vip) || lb_vip_is_nat6_port(vip)) && (args.encap_args.srv_type == LB_SRV_TYPE_NODEPORT) ) { u32 key; uword * entry; //Create maping from nodeport to vip_index - key = clib_host_to_net_u16(node_port); + key = clib_host_to_net_u16(args.port); entry = hash_get_mem (lbm->vip_index_by_nodeport, &key); if (entry) { lb_put_writer_lock(); @@ -922,12 +1138,17 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index) hash_set_mem (lbm->vip_index_by_nodeport, &key, vip - lbm->vips); /* receive packets destined to NodeIP:NodePort */ - udp_register_dst_port (vm, node_port, lb4_nodeport_node.index, 1); - udp_register_dst_port (vm, node_port, lb6_nodeport_node.index, 0); + udp_register_dst_port (vm, args.port, lb4_nodeport_node.index, 1); + udp_register_dst_port (vm, args.port, lb6_nodeport_node.index, 0); } - //Return result *vip_index = vip - lbm->vips; + //Create per-port vip filtering table + if (args.port != 0) + { + lb_vip_add_port_filter(lbm, vip, vip_prefix_index, *vip_index); + vip->vip_prefix_index = vip_prefix_index; + } lb_put_writer_lock(); return 0; @@ -937,6 +1158,11 @@ int lb_vip_del(u32 vip_index) { lb_main_t *lbm = &lb_main; lb_vip_t *vip; + + /* Does not remove default vip, i.e. 
vip_index = 0 */ + if (vip_index == 0) + return 0; + lb_get_writer_lock(); if (!(vip = lb_vip_get_by_index(vip_index))) { lb_put_writer_lock(); @@ -963,6 +1189,12 @@ int lb_vip_del(u32 vip_index) //Delete adjacency lb_vip_del_adjacency(lbm, vip); + //Delete per-port vip filtering entry + if (vip->port != 0) + { + lb_vip_del_port_filter(lbm, vip); + } + //Set the VIP as unused vip->flags &= ~LB_VIP_FLAGS_USED; @@ -1020,15 +1252,21 @@ lb_as_stack (lb_as_t *as) dpo_type = lbm->dpo_gre4_type; else if (lb_vip_is_gre6(vip)) dpo_type = lbm->dpo_gre6_type; + else if (lb_vip_is_gre4_port(vip)) + dpo_type = lbm->dpo_gre4_port_type; + else if (lb_vip_is_gre6_port(vip)) + dpo_type = lbm->dpo_gre6_port_type; else if (lb_vip_is_l3dsr(vip)) dpo_type = lbm->dpo_l3dsr_type; - else if(lb_vip_is_nat4(vip)) - dpo_type = lbm->dpo_nat4_type; - else if (lb_vip_is_nat6(vip)) - dpo_type = lbm->dpo_nat6_type; + else if (lb_vip_is_l3dsr_port(vip)) + dpo_type = lbm->dpo_l3dsr_port_type; + else if(lb_vip_is_nat4_port(vip)) + dpo_type = lbm->dpo_nat4_port_type; + else if (lb_vip_is_nat6_port(vip)) + dpo_type = lbm->dpo_nat6_port_type; dpo_stack(dpo_type, - lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, + lb_vip_is_ip4(vip->type)?DPO_PROTO_IP4:DPO_PROTO_IP6, &as->dpo, fib_entry_contribute_ip_forwarding( as->next_hop_fib_entry_index)); @@ -1036,7 +1274,7 @@ lb_as_stack (lb_as_t *as) static fib_node_back_walk_rc_t lb_fib_node_back_walk_notify (fib_node_t *node, - fib_node_back_walk_ctx_t *ctx) + fib_node_back_walk_ctx_t *ctx) { lb_as_stack(lb_as_from_fib_node(node)); return (FIB_NODE_BACK_WALK_CONTINUE); @@ -1082,6 +1320,7 @@ lb_init (vlib_main_t * vm) lbm->vnet_main = vnet_get_main (); lbm->vlib_main = vm; + lb_vip_t *default_vip; lb_as_t *default_as; fib_node_vft_t lb_fib_node_vft = { .fnv_get = lb_fib_node_get_node, @@ -1094,7 +1333,15 @@ lb_init (vlib_main_t * vm) .dv_format = format_lb_dpo, }; + //Allocate and init default VIP. 
lbm->vips = 0; + pool_get(lbm->vips, default_vip); + default_vip->prefix.ip6.as_u64[0] = 0xffffffffffffffffL; + default_vip->prefix.ip6.as_u64[1] = 0xffffffffffffffffL; + default_vip->protocol = ~0; + default_vip->port = 0; + default_vip->flags = LB_VIP_FLAGS_USED; + lbm->per_cpu = 0; vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1); lbm->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES); @@ -1106,9 +1353,18 @@ lb_init (vlib_main_t * vm) lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL; lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes); lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes); - lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, lb_dpo_l3dsr_nodes); - lbm->dpo_nat4_type = dpo_register_new_type(&lb_vft, lb_dpo_nat4_nodes); - lbm->dpo_nat6_type = dpo_register_new_type(&lb_vft, lb_dpo_nat6_nodes); + lbm->dpo_gre4_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_gre4_port_nodes); + lbm->dpo_gre6_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_gre6_port_nodes); + lbm->dpo_l3dsr_type = dpo_register_new_type(&lb_vft, + lb_dpo_l3dsr_nodes); + lbm->dpo_l3dsr_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_l3dsr_port_nodes); + lbm->dpo_nat4_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_nat4_port_nodes); + lbm->dpo_nat6_port_type = dpo_register_new_type(&lb_vft, + lb_dpo_nat6_port_nodes); lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); //Init AS reference counters @@ -1126,6 +1382,10 @@ lb_init (vlib_main_t * vm) lbm->vip_index_by_nodeport = hash_create_mem (0, sizeof(u16), sizeof (uword)); + clib_bihash_init_8_8 (&lbm->vip_index_per_port, + "vip_index_per_port", LB_VIP_PER_PORT_BUCKETS, + LB_VIP_PER_PORT_MEMORY_SIZE); + clib_bihash_init_8_8 (&lbm->mapping_by_as4, "mapping_by_as4", LB_MAPPING_BUCKETS, LB_MAPPING_MEMORY_SIZE); diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h index c6773a4108f..3177f892ae5 100644 --- a/src/plugins/lb/lb.h +++ 
b/src/plugins/lb/lb.h @@ -47,6 +47,9 @@ #define LB_MAPPING_BUCKETS 1024 #define LB_MAPPING_MEMORY_SIZE 64<<20 +#define LB_VIP_PER_PORT_BUCKETS 1024 +#define LB_VIP_PER_PORT_MEMORY_SIZE 64<<20 + typedef enum { LB_NEXT_DROP, LB_N_NEXT, @@ -187,6 +190,17 @@ typedef enum { } lb_encap_type_t; /** + * Lookup type + */ + +typedef enum { + LB_LKP_SAME_IP_PORT, + LB_LKP_DIFF_IP_PORT, + LB_LKP_ALL_PORT_IP, + LB_LKP_N_TYPES, +} lb_lkp_type_t; + +/** * The load balancer supports IPv4 and IPv6 traffic * and GRE4, GRE6, L3DSR and NAT4, NAT6 encap. */ @@ -214,14 +228,8 @@ typedef struct { /* Service type. clusterip or nodeport */ u8 srv_type; - /* Service port. network byte order */ - u16 port; - /* Pod's port corresponding to specific service. network byte order */ u16 target_port; - - /* Node's port, can access service via NodeIP:node_port. network byte order */ - u16 node_port; }; /* DSCP bits for L3DSR */ u8 dscp; @@ -229,8 +237,21 @@ typedef struct { }; } lb_vip_encap_args_t; +typedef struct { + /* all fields in NET byte order */ + union { + struct { + u32 vip_prefix_index; + u16 port; + u8 protocol; + u8 rsv; + }; + u64 as_u64; + }; +} vip_port_key_t; + /** - * Load balancing service is provided per VIP. + * Load balancing service is provided per VIP+protocol+port. * In this data model, a VIP can be a whole prefix. * But load balancing only * occurs on a per-source-address/port basis. Meaning that if a given source @@ -275,6 +296,15 @@ typedef struct { */ u8 plen; + /* tcp or udp. If not per-port vip, set to ~0 */ + u8 protocol; + + /* tcp port or udp port. If not per-port vip, set to ~0 */ + u16 port; + + /* Valid for per-port vip */ + u32 vip_prefix_index; + /** * The type of traffic for this. * LB_TYPE_UNDEFINED if unknown. 
@@ -301,36 +331,57 @@ typedef struct { u32 *as_indexes; } lb_vip_t; -#define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 \ - || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \ - || (vip)->type == LB_VIP_TYPE_IP4_L3DSR \ - || (vip)->type == LB_VIP_TYPE_IP4_NAT4 ) - -#define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ - || (vip)->type == LB_VIP_TYPE_IP4_GRE4) +#define lb_vip_is_ip4(type) (type == LB_VIP_TYPE_IP4_GRE6 \ + || type == LB_VIP_TYPE_IP4_GRE4 \ + || type == LB_VIP_TYPE_IP4_L3DSR \ + || type == LB_VIP_TYPE_IP4_NAT4 ) -#define lb_vip_is_gre6(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE6 \ - || (vip)->type == LB_VIP_TYPE_IP4_GRE6) +#define lb_vip_is_ip6(type) (type == LB_VIP_TYPE_IP6_GRE6 \ + || type == LB_VIP_TYPE_IP6_GRE4 \ + || type == LB_VIP_TYPE_IP6_NAT6 ) #define lb_encap_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ || (vip)->type == LB_VIP_TYPE_IP4_GRE4 \ || (vip)->type == LB_VIP_TYPE_IP4_L3DSR \ || (vip)->type == LB_VIP_TYPE_IP4_NAT4 ) +#define lb_vip_is_gre4(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE4) \ + && ((vip)->port == 0)) + + +#define lb_vip_is_gre6(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE6 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE6) \ + && ((vip)->port == 0)) + +#define lb_vip_is_gre4_port(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE4 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE4) \ + && ((vip)->port != 0)) + +#define lb_vip_is_gre6_port(vip) (((vip)->type == LB_VIP_TYPE_IP6_GRE6 \ + || (vip)->type == LB_VIP_TYPE_IP4_GRE6) \ + && ((vip)->port != 0)) + always_inline bool lb_vip_is_l3dsr(const lb_vip_t *vip) { - return vip->type == LB_VIP_TYPE_IP4_L3DSR; + return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port ==0); +} + +always_inline bool +lb_vip_is_l3dsr_port(const lb_vip_t *vip) +{ + return (vip->type == LB_VIP_TYPE_IP4_L3DSR && vip->port !=0); } always_inline bool -lb_vip_is_nat4(const lb_vip_t *vip) +lb_vip_is_nat4_port(const lb_vip_t *vip) { - return vip->type == 
LB_VIP_TYPE_IP4_NAT4; + return (vip->type == LB_VIP_TYPE_IP4_NAT4 && vip->port !=0); } always_inline bool -lb_vip_is_nat6(const lb_vip_t *vip) +lb_vip_is_nat6_port(const lb_vip_t *vip) { - return vip->type == LB_VIP_TYPE_IP6_NAT6; + return (vip->type == LB_VIP_TYPE_IP6_NAT6 && vip->port !=0); } format_function_t format_lb_vip; @@ -422,6 +473,11 @@ typedef struct { lb_vip_t *vips; /** + * bitmap for vip prefix to support per-port vip + */ + uword *vip_prefix_indexes; + + /** * Pool of ASs. * ASs are referenced by address and vip index. * The first element (index 0) is special and used only to fill @@ -479,15 +535,20 @@ typedef struct { */ dpo_type_t dpo_gre4_type; dpo_type_t dpo_gre6_type; + dpo_type_t dpo_gre4_port_type; + dpo_type_t dpo_gre6_port_type; dpo_type_t dpo_l3dsr_type; - dpo_type_t dpo_nat4_type; - dpo_type_t dpo_nat6_type; - + dpo_type_t dpo_l3dsr_port_type; + dpo_type_t dpo_nat4_port_type; + dpo_type_t dpo_nat6_port_type; /** * Node type for registering to fib changes. */ fib_node_type_t fib_node_type; + /* lookup per_port vip by key */ + clib_bihash_8_8_t vip_index_per_port; + /* Find a static mapping by AS IP : target_port */ clib_bihash_8_8_t mapping_by_as4; clib_bihash_24_8_t mapping_by_as6; @@ -511,6 +572,8 @@ typedef struct { typedef struct { ip46_address_t prefix; u8 plen; + u8 protocol; + u16 port; lb_vip_type_t type; u32 new_length; lb_vip_encap_args_t encap_args; @@ -537,7 +600,8 @@ int lb_vip_add(lb_vip_add_args_t args, u32 *vip_index); int lb_vip_del(u32 vip_index); -int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index); +int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u8 protocol, + u16 port, u32 *vip_index); #define lb_vip_get_by_index(index) (pool_is_free_index(lb_main.vips, index)?NULL:pool_elt_at_index(lb_main.vips, index)) diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c index fc498706222..83766272fd7 100644 --- a/src/plugins/lb/lb_test.c +++ b/src/plugins/lb/lb_test.c @@ -226,7 +226,7 @@ static 
int api_lb_add_del_as (vat_main_t * vam) #define foreach_vpe_api_msg \ _(lb_conf, "<ip4-src-addr> <ip6-src-address> <sticky_buckets_per_core> <flow_timeout>") \ _(lb_add_del_vip, "<ip-prefix> [gre4|gre6|l3dsr|nat4|nat6] " \ - "<dscp> <port> <target_port> <node_port> " \ + "<dscp> <port> <target_port> " \ "<new_table_len> [del]") \ _(lb_add_del_as, "<vip-ip-prefix> <address> [del]") diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c index b33ea22b5c1..8163e35a06e 100644 --- a/src/plugins/lb/node.c +++ b/src/plugins/lb/node.c @@ -173,14 +173,27 @@ lb_node_get_other_ports6 (ip6_header_t *ip60) return 0; } -static_always_inline u32 -lb_node_get_hash (vlib_buffer_t *p, u8 is_input_v4) +static_always_inline void +lb_node_get_hash (lb_main_t *lbm, vlib_buffer_t *p, u8 is_input_v4, + u32 *hash, u32 *vip_idx, u8 per_port_vip) { - u32 hash; + vip_port_key_t key; + clib_bihash_kv_8_8_t kv, value; + + /* For vip case, retrieve vip index for ip lookup */ + *vip_idx = vnet_buffer (p)->ip.adj_index[VLIB_TX]; + + if (per_port_vip) + { + /* For per-port-vip case, ip lookup stores dummy index */ + key.vip_prefix_index = *vip_idx; + } + if (is_input_v4) { ip4_header_t *ip40; u64 ports; + ip40 = vlib_buffer_get_current (p); if (PREDICT_TRUE( ip40->protocol == IP_PROTOCOL_TCP @@ -190,13 +203,20 @@ lb_node_get_hash (vlib_buffer_t *p, u8 is_input_v4) else ports = lb_node_get_other_ports4 (ip40); - hash = lb_hash_hash (*((u64 *) &ip40->address_pair), ports, 0, 0, 0); + *hash = lb_hash_hash (*((u64 *) &ip40->address_pair), ports, 0, 0, 0); + + if (per_port_vip) + { + key.protocol = ip40->protocol; + key.port = (u16)(ports & 0xFFFF); + } } else { ip6_header_t *ip60; ip60 = vlib_buffer_get_current (p); u64 ports; + if (PREDICT_TRUE( ip60->protocol == IP_PROTOCOL_TCP || ip60->protocol == IP_PROTOCOL_UDP)) @@ -205,18 +225,39 @@ lb_node_get_hash (vlib_buffer_t *p, u8 is_input_v4) else ports = lb_node_get_other_ports6 (ip60); - hash = lb_hash_hash (ip60->src_address.as_u64[0], + *hash = 
lb_hash_hash (ip60->src_address.as_u64[0], ip60->src_address.as_u64[1], ip60->dst_address.as_u64[0], ip60->dst_address.as_u64[1], ports); + + if (per_port_vip) + { + key.protocol = ip60->protocol; + key.port = (u16)(ports & 0xFFFF); + } + } + + /* For per-port-vip case, retrieve vip index for vip_port_filter table */ + if (per_port_vip) + { + kv.key = key.as_u64; + if (clib_bihash_search_8_8(&lbm->vip_index_per_port, &kv, &value) < 0) + { + /* return default vip */ + *vip_idx = 0; + return; + } + *vip_idx = value.value; } - return hash; } static_always_inline uword -lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, +lb_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6) - lb_encap_type_t encap_type) //Compile-time parameter is GRE4/GRE6/L3DSR/NAT4/NAT6 + lb_encap_type_t encap_type, //Compile-time parameter is GRE4/GRE6/L3DSR/NAT4/NAT6 + u8 per_port_vip) //Compile-time parameter stating that is per_port_vip or not { lb_main_t *lbm = &lb_main; u32 n_left_from, *from, next_index, *to_next, n_left_to_next; @@ -229,8 +270,13 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, next_index = node->cached_next_index; u32 nexthash0 = 0; + u32 next_vip_idx0 = ~0; if (PREDICT_TRUE(n_left_from > 0)) - nexthash0 = lb_node_get_hash (vlib_get_buffer (vm, from[0]), is_input_v4); + { + vlib_buffer_t *p0 = vlib_get_buffer (vm, from[0]); + lb_node_get_hash (lbm, p0, is_input_v4, &nexthash0, + &next_vip_idx0, per_port_vip); + } while (n_left_from > 0) { @@ -240,17 +286,21 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, u32 pi0; vlib_buffer_t *p0; lb_vip_t *vip0; - u32 asindex0; + u32 asindex0 = 0; u16 len0; u32 available_index0; u8 counter = 0; u32 hash0 = nexthash0; + u32 vip_index0 = next_vip_idx0; + u32 next0; if (PREDICT_TRUE(n_left_from > 1)) { vlib_buffer_t *p1 = vlib_get_buffer 
(vm, from[1]); //Compute next hash and prefetch bucket - nexthash0 = lb_node_get_hash (p1, is_input_v4); + lb_node_get_hash (lbm, p1, is_input_v4, + &nexthash0, &next_vip_idx0, + per_port_vip); lb_hash_prefetch_bucket (sticky_ht, nexthash0); //Prefetch for encap, next CLIB_PREFETCH(vlib_buffer_get_current (p1) - 64, 64, STORE); @@ -272,8 +322,8 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, n_left_to_next -= 1; p0 = vlib_get_buffer (vm, pi0); - vip0 = pool_elt_at_index(lbm->vips, - vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + + vip0 = pool_elt_at_index(lbm->vips, vip_index0); if (is_input_v4) { @@ -290,7 +340,7 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, } lb_hash_get (sticky_ht, hash0, - vnet_buffer (p0)->ip.adj_index[VLIB_TX], lb_time, + vip_index0, lb_time, &available_index0, &asindex0); if (PREDICT_TRUE(asindex0 != ~0)) @@ -320,7 +370,7 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, //Note that when there is no AS configured, an entry is configured anyway. 
//But no configured AS is not something that should happen lb_hash_put (sticky_ht, hash0, asindex0, - vnet_buffer (p0)->ip.adj_index[VLIB_TX], + vip_index0, available_index0, lb_time); } else @@ -333,7 +383,7 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, vlib_increment_simple_counter ( &lbm->vip_counters[counter], thread_index, - vnet_buffer (p0)->ip.adj_index[VLIB_TX], + vip_index0, 1); //Now let's encap @@ -436,8 +486,7 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, csum, lbm->ass[asindex0].address.ip4.as_u32); ip40->checksum = ip_csum_fold (csum); - if ((ip40->protocol == IP_PROTOCOL_UDP) - || (uh->dst_port == vip0->encap_args.port)) + if (ip40->protocol == IP_PROTOCOL_UDP) { uh->dst_port = vip0->encap_args.target_port; csum = uh->checksum; @@ -448,7 +497,7 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, } else { - next_index = LB_NEXT_DROP; + asindex0 = 0; } } else if ((is_input_v4 == 0) && (encap_type == LB_ENCAP_TYPE_NAT6)) @@ -481,25 +530,25 @@ lb_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame, } else { - next_index = LB_NEXT_DROP; + asindex0 = 0; } } } + next0 = lbm->ass[asindex0].dpo.dpoi_next_node; + //Note that this is going to error if asindex0 == 0 + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = + lbm->ass[asindex0].dpo.dpoi_index; if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) { lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof(*tr)); tr->as_index = asindex0; - tr->vip_index = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + tr->vip_index = vip_index0; } //Enqueue to next - //Note that this is going to error if asindex0 == 0 - vnet_buffer (p0)->ip.adj_index[VLIB_TX] = - lbm->ass[asindex0].dpo.dpoi_index; vlib_validate_buffer_enqueue_x1( - vm, node, next_index, to_next, n_left_to_next, pi0, - lbm->ass[asindex0].dpo.dpoi_next_node); + vm, node, next_index, to_next, n_left_to_next, pi0, next0); } vlib_put_next_frame 
(vm, node, next_index, n_left_to_next); } @@ -887,49 +936,84 @@ static uword lb6_gre6_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE6); + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE6, 0); } static uword lb6_gre4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE4); + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE4, 0); } static uword lb4_gre6_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE6); + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE6, 0); } static uword lb4_gre4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * frame) { - return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE4); + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE4, 0); +} + +static uword +lb6_gre6_port_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE6, 1); +} + +static uword +lb6_gre4_port_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_GRE4, 1); +} + +static uword +lb4_gre6_port_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE6, 1); +} + +static uword +lb4_gre4_port_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_GRE4, 1); } static uword lb4_l3dsr_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * frame) + vlib_frame_t * frame) { - return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_L3DSR); + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_L3DSR, 0); } static uword -lb6_nat6_node_fn 
(vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * frame) +lb4_l3dsr_port_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_NAT6); + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_L3DSR, 1); } static uword -lb4_nat4_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, - vlib_frame_t * frame) +lb6_nat6_port_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return lb_node_fn (vm, node, frame, 0, LB_ENCAP_TYPE_NAT6, 1); +} + +static uword +lb4_nat4_port_node_fn (vlib_main_t * vm, vlib_node_runtime_t * node, + vlib_frame_t * frame) { - return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_NAT4); + return lb_node_fn (vm, node, frame, 1, LB_ENCAP_TYPE_NAT4, 1); } static uword @@ -952,7 +1036,8 @@ VLIB_REGISTER_NODE (lb6_gre6_node) = .name = "lb6-gre6", .vector_size = sizeof(u32), .format_trace = format_lb_trace, - .n_errors = LB_N_ERROR, .error_strings = lb_error_strings, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, .n_next_nodes = LB_N_NEXT, .next_nodes = { [LB_NEXT_DROP] = "error-drop" }, @@ -992,7 +1077,72 @@ VLIB_REGISTER_NODE (lb4_gre4_node) = .format_trace = format_lb_trace, .n_errors = LB_N_ERROR, .error_strings = lb_error_strings, - .n_next_nodes = LB_N_NEXT, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; + +VLIB_REGISTER_NODE (lb6_gre6_port_node) = + { + .function = lb6_gre6_port_node_fn, + .name = "lb6-gre6-port", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; + +VLIB_REGISTER_NODE (lb6_gre4_port_node) = + { + .function = lb6_gre4_port_node_fn, + .name = "lb6-gre4-port", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + 
.n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; + +VLIB_REGISTER_NODE (lb4_gre6_port_node) = + { + .function = lb4_gre6_port_node_fn, + .name = "lb4-gre6-port", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; + +VLIB_REGISTER_NODE (lb4_gre4_port_node) = + { + .function = lb4_gre4_port_node_fn, + .name = "lb4-gre4-port", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { [LB_NEXT_DROP] = "error-drop" }, + }; + +VLIB_REGISTER_NODE (lb4_l3dsr_port_node) = + { + .function = lb4_l3dsr_port_node_fn, + .name = "lb4-l3dsr-port", + .vector_size = sizeof(u32), + .format_trace = format_lb_trace, + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + .n_next_nodes = LB_N_NEXT, .next_nodes = { [LB_NEXT_DROP] = "error-drop" }, }; @@ -1010,10 +1160,10 @@ VLIB_REGISTER_NODE (lb4_l3dsr_node) = { [LB_NEXT_DROP] = "error-drop" }, }; -VLIB_REGISTER_NODE (lb6_nat6_node) = +VLIB_REGISTER_NODE (lb6_nat6_port_node) = { - .function = lb6_nat6_node_fn, - .name = "lb6-nat6", + .function = lb6_nat6_port_node_fn, + .name = "lb6-nat6-port", .vector_size = sizeof(u32), .format_trace = format_lb_trace, .n_errors = LB_N_ERROR, @@ -1023,10 +1173,10 @@ VLIB_REGISTER_NODE (lb6_nat6_node) = { [LB_NEXT_DROP] = "error-drop" }, }; -VLIB_REGISTER_NODE (lb4_nat4_node) = +VLIB_REGISTER_NODE (lb4_nat4_port_node) = { - .function = lb4_nat4_node_fn, - .name = "lb4-nat4", + .function = lb4_nat4_port_node_fn, + .name = "lb4-nat4-port", .vector_size = sizeof(u32), .format_trace = format_lb_trace, .n_errors = LB_N_ERROR, @@ -1061,7 +1211,7 @@ VLIB_REGISTER_NODE (lb4_nodeport_node) = .n_next_nodes = LB4_NODEPORT_N_NEXT, .next_nodes = { - [LB4_NODEPORT_NEXT_IP4_NAT4] = "lb4-nat4", + 
[LB4_NODEPORT_NEXT_IP4_NAT4] = "lb4-nat4-port", [LB4_NODEPORT_NEXT_DROP] = "error-drop", }, }; @@ -1077,7 +1227,7 @@ VLIB_REGISTER_NODE (lb6_nodeport_node) = .n_next_nodes = LB6_NODEPORT_N_NEXT, .next_nodes = { - [LB6_NODEPORT_NEXT_IP6_NAT6] = "lb6-nat6", + [LB6_NODEPORT_NEXT_IP6_NAT6] = "lb6-nat6-port", [LB6_NODEPORT_NEXT_DROP] = "error-drop", }, }; |