diff options
author | Pierre Pfister <ppfister@cisco.com> | 2016-10-05 09:38:21 +0100 |
---|---|---|
committer | Damjan Marion <dmarion.lists@gmail.com> | 2016-10-05 09:33:07 +0000 |
commit | d4bc9af55fc9a7bb5133da8c863569497cb92cd0 (patch) | |
tree | 6a64ccf779e28162e6c6bd260b0602011aadc9d6 | |
parent | 4c20e7197707aa95b144b289704e9e97335db17d (diff) |
Load Balancer: Use FIB 2.0
This patch fixes the load balancer plugin, which has not been working
since the introduction of FIB 2.0.
Two FIB DPO types are defined:
- One for IPv4 GRE
- One for IPv6 GRE
When an AS (application server) is created, the plugin automatically uses
the FIB's forwarding result for the AS address in order to transmit the packet.
Therefore, the packet does not need to visit ip-lookup a second time.
The 'lb bypass' CLI command was removed, as the bypass is now performed
automatically by this mechanism.
Change-Id: Ib505ba31bfc67897eaff752821087821c360360a
Signed-off-by: Pierre Pfister <ppfister@cisco.com>
-rw-r--r-- | plugins/lb-plugin/lb/cli.c | 41 | ||||
-rw-r--r-- | plugins/lb-plugin/lb/lb.c | 286 | ||||
-rw-r--r-- | plugins/lb-plugin/lb/lb.h | 66 | ||||
-rw-r--r-- | plugins/lb-plugin/lb/node.c | 367 |
4 files changed, 358 insertions, 402 deletions
diff --git a/plugins/lb-plugin/lb/cli.c b/plugins/lb-plugin/lb/cli.c index 398572ce..b59c6426 100644 --- a/plugins/lb-plugin/lb/cli.c +++ b/plugins/lb-plugin/lb/cli.c @@ -17,47 +17,6 @@ #include <lb/util.h> static clib_error_t * -lb_bypass_command_fn (vlib_main_t * vm, - unformat_input_t * input, vlib_cli_command_t * cmd) -{ - unformat_input_t _line_input, *line_input = &_line_input; - ip46_address_t vip_prefix, as_addr; - u8 vip_plen; - u32 vip_index; - u8 disable = 0; - int ret; - - if (!unformat_user (input, unformat_line_input, line_input)) - return 0; - - if (!unformat(line_input, "%U", unformat_ip46_prefix, &vip_prefix, &vip_plen, IP46_TYPE_ANY)) - return clib_error_return (0, "invalid vip prefix: '%U'", - format_unformat_error, line_input); - - if ((ret = lb_vip_find_index(&vip_prefix, vip_plen, &vip_index))) - return clib_error_return (0, "lb_vip_find_index error %d", ret); - - if (!unformat(line_input, "%U", unformat_ip46_address, &as_addr, IP46_TYPE_ANY)) - return clib_error_return (0, "invalid as address: '%U'", - format_unformat_error, line_input); - - if (unformat(line_input, "disable")) - disable = 1; - - if ((ret = lb_as_lookup_bypass(vip_index, &as_addr, disable))) - return clib_error_return (0, "lb_as_lookup_bypass error %d", ret); - - return 0; -} - -VLIB_CLI_COMMAND (lb_bypass_command, static) = -{ - .path = "lb bypass", - .short_help = "lb bypass <prefix> <address> [disable]", - .function = lb_bypass_command_fn, -}; - -static clib_error_t * lb_vip_command_fn (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) { diff --git a/plugins/lb-plugin/lb/lb.c b/plugins/lb-plugin/lb/lb.c index 140c221a..6af4697e 100644 --- a/plugins/lb-plugin/lb/lb.c +++ b/plugins/lb-plugin/lb/lb.c @@ -28,6 +28,25 @@ lb_main_t lb_main; #define lb_get_writer_lock() do {} while(__sync_lock_test_and_set (lb_main.writer_lock, 1)) #define lb_put_writer_lock() lb_main.writer_lock[0] = 0 +static void lb_as_stack (lb_as_t *as); + + +const static char * const 
lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL }; +const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL }; +const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, + }; + +const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL }; +const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL }; +const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, + }; + u32 lb_hash_time_now(vlib_main_t * vm) { return (u32) (vlib_time_now(vm) + 10000); @@ -143,12 +162,12 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) u32 *as_index; pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; - s = format(s, "%U %U %d buckets %d flows adj:%u %s\n", + s = format(s, "%U %U %d buckets %d flows dpo:%u %s\n", format_white_space, indent, format_ip46_address, &as->address, IP46_TYPE_ANY, count[as - lbm->ass], vlib_refcount_get(&lbm->as_refcount, as - lbm->ass), - as->adj_index, + as->dpo.dpoi_index, (as->flags & LB_AS_FLAGS_USED)?"used":" removed"); }); @@ -164,7 +183,6 @@ u8 *format_lb_vip_detailed (u8 * s, va_list * args) return s; } - typedef struct { u32 as_index; u32 last; @@ -195,11 +213,18 @@ static void lb_vip_garbage_collection(lb_vip_t *vip) pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; if (!(as->flags & LB_AS_FLAGS_USED) && //Not used - clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used - (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) { //Not referenced - pool_put(vip->as_indexes, as_index); - pool_put(lbm->ass, as); - } + clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used + (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) + { //Not referenced + fib_entry_child_remove(as->next_hop_fib_entry_index, + as->next_hop_child_index); + 
fib_table_entry_delete_index(as->next_hop_fib_entry_index, + FIB_SOURCE_RR); + as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID; + + pool_put(vip->as_indexes, as_index); + pool_put(lbm->ass, as); + } }); } @@ -449,7 +474,6 @@ next: //Update reused ASs vec_foreach(ip, to_be_updated) { lbm->ass[*ip].flags = LB_AS_FLAGS_USED; - lbm->ass[*ip].adj_index = ~0; } vec_free(to_be_updated); @@ -461,9 +485,36 @@ next: as->address = addresses[*ip]; as->flags = LB_AS_FLAGS_USED; as->vip_index = vip_index; - as->adj_index = ~0; pool_get(vip->as_indexes, as_index); *as_index = as - lbm->ass; + + /* + * become a child of the FIB entry + * so we are informed when its forwarding changes + */ + fib_prefix_t nh = {}; + if (lb_vip_is_gre4(vip)) { + nh.fp_addr.ip4 = as->address.ip4; + nh.fp_len = 32; + nh.fp_proto = FIB_PROTOCOL_IP4; + } else { + nh.fp_addr.ip6 = as->address.ip6; + nh.fp_len = 128; + nh.fp_proto = FIB_PROTOCOL_IP6; + } + + as->next_hop_fib_entry_index = + fib_table_entry_special_add(0, + &nh, + FIB_SOURCE_RR, + FIB_ENTRY_FLAG_NONE, + ADJ_INDEX_INVALID); + as->next_hop_child_index = + fib_entry_child_add(as->next_hop_fib_entry_index, + lbm->fib_node_type, + as - lbm->ass); + + lb_as_stack(as); } vec_free(to_be_added); @@ -535,100 +586,33 @@ int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n) return ret; } -int lb_as_lookup_bypass(u32 vip_index, ip46_address_t *address, u8 is_disable) -{ - /* lb_get_writer_lock(); */ - /* lb_main_t *lbm = &lb_main; */ - /* u32 as_index; */ - /* lb_as_t *as; */ - /* lb_vip_t *vip; */ - - /* if (!(vip = lb_vip_get_by_index(vip_index)) || */ - /* lb_as_find_index_vip(vip, address, &as_index)) { */ - /* lb_put_writer_lock(); */ - /* return VNET_API_ERROR_NO_SUCH_ENTRY; */ - /* } */ - - /* as = &lbm->ass[as_index]; */ - - /* if (is_disable) { */ - /* as->adj_index = ~0; */ - /* } else if (lb_vip_is_gre4(vip)) { */ - /* uword *p = ip4_get_route (&ip4_main, 0, 0, as->address.ip4.as_u8, 32); */ - /* if (p == 0) { */ - /* 
lb_put_writer_lock(); */ - /* return VNET_API_ERROR_NO_SUCH_ENTRY; */ - /* } */ - /* u32 ai = (u32)p[0]; */ - /* ip_lookup_main_t *lm4 = &ip4_main.lookup_main; */ - /* ip_adjacency_t *adj4 = ip_get_adjacency (lm4, ai); */ - /* if (adj4->lookup_next_index != IP_LOOKUP_NEXT_REWRITE) { */ - /* lb_put_writer_lock(); */ - /* return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE; */ - /* } */ - - /* as->adj_index = ai; */ - /* } else { */ - /* u32 ai = ip6_get_route (&ip6_main, 0, 0, &as->address.ip6, 128); */ - /* if (ai == 0) { */ - /* lb_put_writer_lock(); */ - /* return VNET_API_ERROR_NO_SUCH_ENTRY; */ - /* } */ - - /* ip_lookup_main_t *lm6 = &ip6_main.lookup_main; */ - /* ip_adjacency_t *adj6 = ip_get_adjacency (lm6, ai); */ - /* if (adj6->lookup_next_index != IP_LOOKUP_NEXT_REWRITE) { */ - /* lb_put_writer_lock(); */ - /* return VNET_API_ERROR_INCORRECT_ADJACENCY_TYPE; */ - /* } */ - - /* as->adj_index = ai; */ - /* } */ - /* lb_put_writer_lock(); */ - return 0; -} - - /** * Add the VIP adjacency to the ip4 or ip6 fib */ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) { - /* ip_adjacency_t adj; */ - /* //Adjacency */ - /* memset (&adj, 0, sizeof (adj)); */ - /* adj.explicit_fib_index = ~0; */ - /* lb_adj_data_t *ad = (lb_adj_data_t *) &adj.opaque; */ - /* ad->vip_index = vip - lbm->vips; */ - - /* ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned */ - /* u32 lookup_next_index = lbm->ip_lookup_next_index[vip->type]; */ - - /* if (lb_vip_is_ip4(vip)) { */ - /* adj.lookup_next_index = lookup_next_index; */ - /* ip4_add_del_route_args_t route_args = {}; */ - /* ip4_main_t *im4 = &ip4_main; */ - /* route_args.table_index_or_table_id = 0; */ - /* route_args.flags = IP4_ROUTE_FLAG_ADD; */ - /* route_args.dst_address = vip->prefix.ip4; */ - /* route_args.dst_address_length = vip->plen - 96; */ - /* route_args.adj_index = ~0; */ - /* route_args.add_adj = &adj; */ - /* route_args.n_add_adj = 1; */ - /* ip4_add_del_route (im4, &route_args); */ - 
/* } else { */ - /* adj.lookup_next_index = lookup_next_index; */ - /* ip6_add_del_route_args_t route_args = {}; */ - /* ip6_main_t *im6 = &ip6_main; */ - /* route_args.table_index_or_table_id = 0; */ - /* route_args.flags = IP6_ROUTE_FLAG_ADD; */ - /* route_args.dst_address = vip->prefix.ip6; */ - /* route_args.dst_address_length = vip->plen; */ - /* route_args.adj_index = ~0; */ - /* route_args.add_adj = &adj; */ - /* route_args.n_add_adj = 1; */ - /* ip6_add_del_route (im6, &route_args); */ - /* } */ + dpo_proto_t proto = 0; + dpo_id_t dpo = DPO_NULL; + fib_prefix_t pfx = {}; + if (lb_vip_is_ip4(vip)) { + pfx.fp_addr.ip4 = vip->prefix.ip4; + pfx.fp_len = vip->plen - 96; + pfx.fp_proto = FIB_PROTOCOL_IP4; + proto = DPO_PROTO_IP4; + } else { + pfx.fp_addr.ip6 = vip->prefix.ip6; + pfx.fp_len = vip->plen; + pfx.fp_proto = FIB_PROTOCOL_IP6; + proto = DPO_PROTO_IP6; + } + dpo_set(&dpo, lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, + proto, vip - lbm->vips); + fib_table_entry_special_dpo_add(0, + &pfx, + FIB_SOURCE_PLUGIN_HI, + FIB_ENTRY_FLAG_EXCLUSIVE, + &dpo); + dpo_reset(&dpo); } /** @@ -636,30 +620,17 @@ static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) */ static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip) { - /* ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned */ - /* if (lb_vip_is_ip4(vip)) { */ - /* ip4_main_t *im4 = &ip4_main; */ - /* ip4_add_del_route_args_t route_args = {}; */ - /* route_args.table_index_or_table_id = 0; */ - /* route_args.flags = IP4_ROUTE_FLAG_DEL; */ - /* route_args.dst_address = vip->prefix.ip4; */ - /* route_args.dst_address_length = vip->plen - 96; */ - /* route_args.adj_index = ~0; */ - /* route_args.add_adj = NULL; */ - /* route_args.n_add_adj = 0; */ - /* ip4_add_del_route (im4, &route_args); */ - /* } else { */ - /* ip6_main_t *im6 = &ip6_main; */ - /* ip6_add_del_route_args_t route_args = {}; */ - /* route_args.table_index_or_table_id = 0; */ - /* route_args.flags = 
IP6_ROUTE_FLAG_DEL; */ - /* route_args.dst_address = vip->prefix.ip6; */ - /* route_args.dst_address_length = vip->plen; */ - /* route_args.adj_index = ~0; */ - /* route_args.add_adj = NULL; */ - /* route_args.n_add_adj = 0; */ - /* ip6_add_del_route (im6, &route_args); */ - /* } */ + fib_prefix_t pfx = {}; + if (lb_vip_is_ip4(vip)) { + pfx.fp_addr.ip4 = vip->prefix.ip4; + pfx.fp_len = vip->plen - 96; + pfx.fp_proto = FIB_PROTOCOL_IP4; + } else { + pfx.fp_addr.ip6 = vip->prefix.ip6; + pfx.fp_len = vip->plen; + pfx.fp_proto = FIB_PROTOCOL_IP6; + } + fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI); } int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_length, u32 *vip_index) @@ -766,12 +737,76 @@ vlib_plugin_register (vlib_main_t * vm, return error; } + +u8 *format_lb_dpo (u8 * s, va_list * va) +{ + index_t index = va_arg (*va, index_t); + CLIB_UNUSED(u32 indent) = va_arg (*va, u32); + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = pool_elt_at_index (lbm->vips, index); + return format (s, "%U", format_lb_vip, vip); +} + +static void lb_dpo_lock (dpo_id_t *dpo) {} +static void lb_dpo_unlock (dpo_id_t *dpo) {} + +static fib_node_t * +lb_fib_node_get_node (fib_node_index_t index) +{ + lb_main_t *lbm = &lb_main; + lb_as_t *as = pool_elt_at_index (lbm->ass, index); + return (&as->fib_node); +} + +static void +lb_fib_node_last_lock_gone (fib_node_t *node) +{ +} + +static lb_as_t * +lb_as_from_fib_node (fib_node_t *node) +{ + return ((lb_as_t*)(((char*)node) - + STRUCT_OFFSET_OF(lb_as_t, fib_node))); +} + +static void +lb_as_stack (lb_as_t *as) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = &lbm->vips[as->vip_index]; + dpo_stack(lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, + lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, + &as->dpo, + fib_entry_contribute_ip_forwarding( + as->next_hop_fib_entry_index)); +} + +static fib_node_back_walk_rc_t +lb_fib_node_back_walk_notify (fib_node_t *node, + fib_node_back_walk_ctx_t *ctx) 
+{ + lb_as_stack(lb_as_from_fib_node(node)); + return (FIB_NODE_BACK_WALK_CONTINUE); +} + clib_error_t * lb_init (vlib_main_t * vm) { vlib_thread_main_t *tm = vlib_get_thread_main (); lb_main_t *lbm = &lb_main; lb_as_t *default_as; + fib_node_vft_t lb_fib_node_vft = { + .fnv_get = lb_fib_node_get_node, + .fnv_last_lock = lb_fib_node_last_lock_gone, + .fnv_back_walk = lb_fib_node_back_walk_notify, + }; + dpo_vft_t lb_vft = { + .dv_lock = lb_dpo_lock, + .dv_unlock = lb_dpo_unlock, + .dv_format = format_lb_dpo, + }; + lbm->vips = 0; lbm->per_cpu = 0; vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1); @@ -782,6 +817,9 @@ lb_init (vlib_main_t * vm) lbm->ip4_src_address.as_u32 = 0xffffffff; lbm->ip6_src_address.as_u64[0] = 0xffffffffffffffffL; lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL; + lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes); + lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes); + lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); //Init AS reference counters vlib_refcount_init(&lbm->as_refcount); @@ -790,7 +828,7 @@ lb_init (vlib_main_t * vm) lbm->ass = 0; pool_get(lbm->ass, default_as); default_as->flags = 0; - default_as->adj_index = ~0; + default_as->dpo.dpoi_next_node = LB_NEXT_DROP; default_as->vip_index = ~0; default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL; default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL; diff --git a/plugins/lb-plugin/lb/lb.h b/plugins/lb-plugin/lb/lb.h index 14a4d8a3..09cfde3e 100644 --- a/plugins/lb-plugin/lb/lb.h +++ b/plugins/lb-plugin/lb/lb.h @@ -35,18 +35,30 @@ #include <vnet/vnet.h> #include <vnet/ip/ip.h> +#include <vnet/dpo/dpo.h> +#include <vnet/fib/fib_table.h> #include <lb/lbhash.h> #define LB_DEFAULT_PER_CPU_STICKY_BUCKETS 1 << 10 #define LB_DEFAULT_FLOW_TIMEOUT 40 +typedef enum { + LB_NEXT_DROP, + LB_N_NEXT, +} lb_next_t; + /** * Each VIP is configured with a set of * application server. 
*/ typedef struct { /** + * Registration to FIB event. + */ + fib_node_t fib_node; + + /** * Destination address used to tunnel traffic towards * that application server. * The address is also used as ID and pseudo-random @@ -55,13 +67,6 @@ typedef struct { ip46_address_t address; /** - * Second ip lookup can be avoided by sending directly the packet - * to ip-rewrite with a configured adjacency. - * When set to ~0, the packets are sent to ip6-lookup. - */ - u32 adj_index; - - /** * ASs are indexed by address and VIP Index. * Which means there will be duplicated if the same server * address is used for multiple VIPs. @@ -86,6 +91,22 @@ typedef struct { * may happen. */ u32 last_used; + + /** + * The FIB entry index for the next-hop + */ + fib_node_index_t next_hop_fib_entry_index; + + /** + * The child index on the FIB entry + */ + u32 next_hop_child_index; + + /** + * The next DPO in the graph to follow. + */ + dpo_id_t dpo; + } lb_as_t; format_function_t format_lb_as; @@ -180,15 +201,13 @@ typedef struct { * in the adjacency index. */ u8 flags; +#define LB_VIP_FLAGS_USED 0x1 /** * Pool of AS indexes used for this VIP. * This also includes ASs that have been removed (but are still referenced). */ u32 *as_indexes; - -#define LB_VIP_FLAGS_USED 0x1 - } lb_vip_t; #define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 || (vip)->type == LB_VIP_TYPE_IP4_GRE4) @@ -261,6 +280,17 @@ typedef struct { vlib_simple_counter_main_t vip_counters[LB_N_VIP_COUNTERS]; /** + * DPO used to send packet from IP4/6 lookup to LB node. + */ + dpo_type_t dpo_gre4_type; + dpo_type_t dpo_gre6_type; + + /** + * Node type for registering to fib changes. + */ + fib_node_type_t fib_node_type; + + /** * API dynamically registered base ID. */ u16 msg_id_base; @@ -268,16 +298,6 @@ typedef struct { volatile u32 *writer_lock; } lb_main_t; -/** - * struct stored in adj->opaque data. - */ -typedef struct { - /** - * Index of the VIP associated with that IP adjacency. 
- */ - u32 vip_index; -} lb_adj_data_t; - extern lb_main_t lb_main; extern vlib_node_registration_t lb6_node; extern vlib_node_registration_t lb4_node; @@ -302,12 +322,6 @@ int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index); int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n); int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n); -/** - * Updates the adjacency index stored in the AS such that the second - * IP lookup (after encap) can be bypassed. - */ -int lb_as_lookup_bypass(u32 vip_index, ip46_address_t *address, u8 is_disable); - u32 lb_hash_time_now(vlib_main_t * vm); void lb_garbage_collection(); diff --git a/plugins/lb-plugin/lb/node.c b/plugins/lb-plugin/lb/node.c index c51a2108..77beaac9 100644 --- a/plugins/lb-plugin/lb/node.c +++ b/plugins/lb-plugin/lb/node.c @@ -36,28 +36,11 @@ static char *lb_error_strings[] = { #undef _ }; -typedef enum { - LB_NEXT_LOOKUP, - LB_NEXT_REWRITE, - LB_NEXT_DROP, - LB_N_NEXT, -} lb_next_t; - typedef struct { u32 vip_index; u32 as_index; } lb_trace_t; -/* u8 *lb_format_adjacency(u8 * s, va_list * va) */ -/* { */ -/* lb_main_t *lbm = &lb_main; */ -/* __attribute((unused)) ip_lookup_main_t *lm = va_arg (*va, ip_lookup_main_t *); */ -/* ip_adjacency_t *adj = va_arg (*va, ip_adjacency_t *); */ -/* lb_adj_data_t *ad = (lb_adj_data_t *) &adj->opaque; */ -/* __attribute__((unused)) lb_vip_t *vip = pool_elt_at_index (lbm->vips, ad->vip_index); */ -/* return format(s, "vip idx:%d", ad->vip_index); */ -/* } */ - u8 * format_lb_trace (u8 * s, va_list * args) { @@ -108,169 +91,162 @@ lb_node_fn (vlib_main_t * vm, u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6) u8 is_encap_v4) //Compile-time parameter stating that is GRE encap is v4 (or v6) { - /* ip_lookup_main_t *lm = (is_input_v4)?&ip4_main.lookup_main:&ip6_main.lookup_main; */ - /* lb_main_t *lbm = &lb_main; */ - /* vlib_node_runtime_t *error_node = node; */ - /* u32 n_left_from, *from, next_index, 
*to_next, n_left_to_next; */ - /* u32 cpu_index = os_get_cpu_number(); */ - /* u32 lb_time = lb_hash_time_now(vm); */ - - /* lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index); */ - /* from = vlib_frame_vector_args (frame); */ - /* n_left_from = frame->n_vectors; */ - /* next_index = node->cached_next_index; */ - - /* while (n_left_from > 0) */ - /* { */ - /* vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); */ - /* while (n_left_from > 0 && n_left_to_next > 0) */ - /* { */ - /* u32 pi0; */ - /* vlib_buffer_t *p0; */ - /* ip_adjacency_t *adj0; */ - /* lb_adj_data_t *ad0; */ - /* lb_vip_t *vip0; */ - /* lb_as_t *as0; */ - /* gre_header_t *gre0; */ - /* u16 len0; */ - /* u32 value0, available_index0, hash0; */ - /* u64 key0[5]; */ - /* lb_error_t error0 = LB_ERROR_NONE; */ - /* lb_next_t next0 = LB_NEXT_LOOKUP; */ - - /* if (PREDICT_TRUE(n_left_from > 1)) */ - /* { */ - /* vlib_buffer_t *p2; */ - /* p2 = vlib_get_buffer(vm, from[1]); */ - /* vlib_prefetch_buffer_header(p2, STORE); */ - /* /\* IPv4 + 8 = 28. 
possibly plus -40 *\/ */ - /* CLIB_PREFETCH (vlib_buffer_get_current(p2) - 40, 128, STORE); */ - /* } */ - - /* pi0 = to_next[0] = from[0]; */ - /* from += 1; */ - /* n_left_from -= 1; */ - /* to_next += 1; */ - /* n_left_to_next -= 1; */ - - /* p0 = vlib_get_buffer (vm, pi0); */ - /* adj0 = ip_get_adjacency (lm, vnet_buffer (p0)->ip.adj_index[VLIB_TX]); */ - /* ad0 = (lb_adj_data_t *) &adj0->opaque; */ - /* vip0 = pool_elt_at_index (lbm->vips, ad0->vip_index); */ - - /* if (is_input_v4) { */ - /* ip4_header_t *ip40; */ - /* ip40 = vlib_buffer_get_current (p0); */ - /* len0 = clib_net_to_host_u16(ip40->length); */ - /* key0[0] = (u64) ip40->src_address.as_u32; */ - /* key0[1] = (u64) ip40->dst_address.as_u32; */ - /* key0[2] = 0; */ - /* key0[3] = 0; */ - /* key0[4] = ((u64)((udp_header_t *)(ip40 + 1))->src_port << 32) | */ - /* ((u64)((udp_header_t *)(ip40 + 1))->dst_port << 16); */ - - /* hash0 = lb_hash_hash(key0); */ - /* } else { */ - /* ip6_header_t *ip60; */ - /* ip60 = vlib_buffer_get_current (p0); */ - /* len0 = clib_net_to_host_u16(ip60->payload_length) + sizeof(ip6_header_t); */ - /* key0[0] = ip60->src_address.as_u64[0]; */ - /* key0[1] = ip60->src_address.as_u64[1]; */ - /* key0[2] = ip60->dst_address.as_u64[0]; */ - /* key0[3] = ip60->dst_address.as_u64[1]; */ - /* key0[4] = ((u64)((udp_header_t *)(ip60 + 1))->src_port << 32) | */ - /* ((u64)((udp_header_t *)(ip60 + 1))->dst_port << 16); */ - - /* hash0 = lb_hash_hash(key0); */ - /* } */ - - /* //NOTE: This is an ugly trick to not include the VIP index in the hash calculation */ - /* //but actually use it in the key determination. 
*/ - /* key0[4] |= ((vip0 - lbm->vips)); */ - - /* lb_hash_get(sticky_ht, key0, hash0, lb_time, &available_index0, &value0); */ - /* if (PREDICT_TRUE(value0 != ~0)) { */ - /* //Found an existing entry */ - /* as0 = &lbm->ass[value0]; */ - /* } else if (PREDICT_TRUE(available_index0 != ~0)) { */ - /* //There is an available slot for a new flow */ - /* as0 = &lbm->ass[vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index]; */ - /* if (PREDICT_FALSE(as0 == lbm->ass)) { //Special first element */ - /* error0 = LB_ERROR_NO_SERVER; */ - /* next0 = LB_NEXT_DROP; */ - /* } else { */ - /* vlib_increment_simple_counter(&lbm->vip_counters[LB_VIP_COUNTER_TRACKED_SESSION], */ - /* cpu_index, vip0 - lbm->vips, 1); */ - /* } */ - - /* //TODO: There are race conditions with as0 and vip0 manipulation. */ - /* //Configuration may be changed, vectors resized, etc... */ - - /* //Dereference previously used */ - /* vlib_refcount_add(&lbm->as_refcount, cpu_index, lb_hash_available_value(sticky_ht, available_index0), -1); */ - /* vlib_refcount_add(&lbm->as_refcount, cpu_index, as0 - lbm->ass, 1); */ - - /* //Add sticky entry */ - /* //Note that when there is no AS configured, an entry is configured anyway. 
*/ - /* //But no configured AS is not something that should happen */ - /* lb_hash_put(sticky_ht, key0, as0 - lbm->ass, available_index0, lb_time); */ - /* } else { */ - /* //Could not store new entry in the table */ - /* as0 = &lbm->ass[vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index]; */ - /* vlib_increment_simple_counter(&lbm->vip_counters[LB_VIP_COUNTER_UNTRACKED_PACKET], */ - /* cpu_index, vip0 - lbm->vips, 1); */ - /* } */ - - /* //Now let's encap */ - /* if (is_encap_v4) { */ - /* ip4_header_t *ip40; */ - /* vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t)); */ - /* ip40 = vlib_buffer_get_current(p0); */ - /* gre0 = (gre_header_t *)(ip40 + 1); */ - /* ip40->src_address = lbm->ip4_src_address; */ - /* ip40->dst_address = as0->address.ip4; */ - /* ip40->ip_version_and_header_length = 0x45; */ - /* ip40->ttl = 128; */ - /* ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); */ - /* ip40->protocol = IP_PROTOCOL_GRE; */ - /* ip40->checksum = ip4_header_checksum (ip40); */ - /* } else { */ - /* ip6_header_t *ip60; */ - /* vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t)); */ - /* ip60 = vlib_buffer_get_current(p0); */ - /* gre0 = (gre_header_t *)(ip60 + 1); */ - /* ip60->dst_address = as0->address.ip6; */ - /* ip60->src_address = lbm->ip6_src_address; */ - /* ip60->hop_limit = 128; */ - /* ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28); */ - /* ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t)); */ - /* ip60->protocol = IP_PROTOCOL_GRE; */ - /* } */ - - /* gre0->flags_and_version = 0; */ - /* gre0->protocol = (is_input_v4)? 
*/ - /* clib_host_to_net_u16(0x0800): */ - /* clib_host_to_net_u16(0x86DD); */ - - /* vnet_buffer(p0)->ip.adj_index[VLIB_TX] = as0->adj_index; */ - /* next0 = (as0->adj_index != ~0)?LB_NEXT_REWRITE:next0; */ - - /* if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) */ - /* { */ - /* lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr)); */ - /* tr->as_index = as0 - lbm->ass; */ - /* tr->vip_index = ad0->vip_index; */ - /* } */ - - /* p0->error = error_node->errors[error0]; */ - /* vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, */ - /* n_left_to_next, pi0, next0); */ - /* } */ - /* vlib_put_next_frame (vm, node, next_index, n_left_to_next); */ - /* } */ - - /* return frame->n_vectors; */ - return 0; + lb_main_t *lbm = &lb_main; + vlib_node_runtime_t *error_node = node; + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + u32 cpu_index = os_get_cpu_number(); + u32 lb_time = lb_hash_time_now(vm); + + lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index); + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0; + vlib_buffer_t *p0; + lb_vip_t *vip0; + lb_as_t *as0; + gre_header_t *gre0; + u16 len0; + u32 value0, available_index0, hash0; + u64 key0[5]; + lb_error_t error0 = LB_ERROR_NONE; + + if (PREDICT_TRUE(n_left_from > 1)) + { + vlib_buffer_t *p2; + p2 = vlib_get_buffer(vm, from[1]); + vlib_prefetch_buffer_header(p2, STORE); + /* IPv4 + 8 = 28. 
possibly plus -40 */ + CLIB_PREFETCH (vlib_buffer_get_current(p2) - 40, 128, STORE); + } + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + vip0 = pool_elt_at_index (lbm->vips, + vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + + if (is_input_v4) { + ip4_header_t *ip40; + ip40 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16(ip40->length); + key0[0] = (u64) ip40->src_address.as_u32; + key0[1] = (u64) ip40->dst_address.as_u32; + key0[2] = 0; + key0[3] = 0; + key0[4] = ((u64)((udp_header_t *)(ip40 + 1))->src_port << 32) | + ((u64)((udp_header_t *)(ip40 + 1))->dst_port << 16); + + hash0 = lb_hash_hash(key0); + } else { + ip6_header_t *ip60; + ip60 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16(ip60->payload_length) + sizeof(ip6_header_t); + key0[0] = ip60->src_address.as_u64[0]; + key0[1] = ip60->src_address.as_u64[1]; + key0[2] = ip60->dst_address.as_u64[0]; + key0[3] = ip60->dst_address.as_u64[1]; + key0[4] = ((u64)((udp_header_t *)(ip60 + 1))->src_port << 32) | + ((u64)((udp_header_t *)(ip60 + 1))->dst_port << 16); + + hash0 = lb_hash_hash(key0); + } + + //NOTE: This is an ugly trick to not include the VIP index in the hash calculation + //but actually use it in the key determination. 
+ key0[4] |= ((vip0 - lbm->vips)); + + lb_hash_get(sticky_ht, key0, hash0, lb_time, &available_index0, &value0); + if (PREDICT_TRUE(value0 != ~0)) { + //Found an existing entry + as0 = &lbm->ass[value0]; + } else if (PREDICT_TRUE(available_index0 != ~0)) { + //There is an available slot for a new flow + as0 = &lbm->ass[vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index]; + if (PREDICT_FALSE(as0 == lbm->ass)) { //Special first element + error0 = LB_ERROR_NO_SERVER; + } else { + vlib_increment_simple_counter(&lbm->vip_counters[LB_VIP_COUNTER_TRACKED_SESSION], + cpu_index, vip0 - lbm->vips, 1); + } + + //TODO: There are race conditions with as0 and vip0 manipulation. + //Configuration may be changed, vectors resized, etc... + + //Dereference previously used + vlib_refcount_add(&lbm->as_refcount, cpu_index, lb_hash_available_value(sticky_ht, available_index0), -1); + vlib_refcount_add(&lbm->as_refcount, cpu_index, as0 - lbm->ass, 1); + + //Add sticky entry + //Note that when there is no AS configured, an entry is configured anyway. 
+ //But no configured AS is not something that should happen + lb_hash_put(sticky_ht, key0, as0 - lbm->ass, available_index0, lb_time); + } else { + //Could not store new entry in the table + as0 = &lbm->ass[vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index]; + vlib_increment_simple_counter(&lbm->vip_counters[LB_VIP_COUNTER_UNTRACKED_PACKET], + cpu_index, vip0 - lbm->vips, 1); + } + + //Now let's encap + if (is_encap_v4) { + ip4_header_t *ip40; + vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t)); + ip40 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip40 + 1); + ip40->src_address = lbm->ip4_src_address; + ip40->dst_address = as0->address.ip4; + ip40->ip_version_and_header_length = 0x45; + ip40->ttl = 128; + ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); + ip40->protocol = IP_PROTOCOL_GRE; + ip40->checksum = ip4_header_checksum (ip40); + } else { + ip6_header_t *ip60; + vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t)); + ip60 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip60 + 1); + ip60->dst_address = as0->address.ip6; + ip60->src_address = lbm->ip6_src_address; + ip60->hop_limit = 128; + ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28); + ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t)); + ip60->protocol = IP_PROTOCOL_GRE; + } + + gre0->flags_and_version = 0; + gre0->protocol = (is_input_v4)? 
+ clib_host_to_net_u16(0x0800): + clib_host_to_net_u16(0x86DD); + + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = as0->dpo.dpoi_index; + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr)); + tr->as_index = as0 - lbm->ass; + tr->vip_index = vip0 - lbm->vips; + } + + p0->error = error_node->errors[error0]; + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, pi0, + as0->dpo.dpoi_next_node); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; } static uword @@ -314,18 +290,10 @@ VLIB_REGISTER_NODE (lb6_gre6_node) = .n_next_nodes = LB_N_NEXT, .next_nodes = { - [LB_NEXT_LOOKUP] = "ip6-lookup", - [LB_NEXT_REWRITE] = "ip6-rewrite", [LB_NEXT_DROP] = "error-drop" }, }; -/* VNET_IP6_REGISTER_ADJACENCY(lb6_gre6) = { */ -/* .node_name = "lb6-gre6", */ -/* .fn = lb_format_adjacency, */ -/* .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE6] */ -/* }; */ - VLIB_REGISTER_NODE (lb6_gre4_node) = { .function = lb6_gre4_node_fn, @@ -339,18 +307,10 @@ VLIB_REGISTER_NODE (lb6_gre4_node) = .n_next_nodes = LB_N_NEXT, .next_nodes = { - [LB_NEXT_LOOKUP] = "ip4-lookup", - [LB_NEXT_REWRITE]= "ip4-rewrite-transit", [LB_NEXT_DROP] = "error-drop" }, }; -/* VNET_IP6_REGISTER_ADJACENCY(lb6_gre4) = { */ -/* .node_name = "lb6-gre4", */ -/* .fn = lb_format_adjacency, */ -/* .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP6_GRE4] */ -/* }; */ - VLIB_REGISTER_NODE (lb4_gre6_node) = { .function = lb4_gre6_node_fn, @@ -364,18 +324,10 @@ VLIB_REGISTER_NODE (lb4_gre6_node) = .n_next_nodes = LB_N_NEXT, .next_nodes = { - [LB_NEXT_LOOKUP] = "ip6-lookup", - [LB_NEXT_REWRITE] = "ip6-rewrite", [LB_NEXT_DROP] = "error-drop" }, }; -/* VNET_IP4_REGISTER_ADJACENCY(lb4_gre6) = { */ -/* .node_name = "lb4-gre6", */ -/* .fn = lb_format_adjacency, */ -/* .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE6] */ -/* }; */ - VLIB_REGISTER_NODE 
(lb4_gre4_node) = { .function = lb4_gre4_node_fn, @@ -389,14 +341,7 @@ VLIB_REGISTER_NODE (lb4_gre4_node) = .n_next_nodes = LB_N_NEXT, .next_nodes = { - [LB_NEXT_LOOKUP] = "ip4-lookup", - [LB_NEXT_REWRITE]= "ip4-rewrite-transit", [LB_NEXT_DROP] = "error-drop" }, }; -/* VNET_IP4_REGISTER_ADJACENCY(lb4_gre4) = { */ -/* .node_name = "lb4-gre4", */ -/* .fn = lb_format_adjacency, */ -/* .next_index = &lb_main.ip_lookup_next_index[LB_VIP_TYPE_IP4_GRE4] */ -/* }; */ |