From 3590ac5881261c95a3c575360e24903d60fac392 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Mon, 8 Aug 2016 16:04:26 +0200 Subject: VPP-196 LISP L2/L3 tunnel multihoming Change-Id: If96d9ff23a7aacdb684494f854d4029f55837065 Signed-off-by: Florin Coras --- vnet/vnet/lisp-cp/control.c | 54 +++++---- vnet/vnet/lisp-cp/gid_dictionary.c | 2 +- vnet/vnet/lisp-cp/lisp_types.h | 8 +- vnet/vnet/lisp-gpe/interface.c | 137 ++++++++++++++++------ vnet/vnet/lisp-gpe/ip_forward.c | 53 ++++++++- vnet/vnet/lisp-gpe/lisp_gpe.c | 230 +++++++++++++++++++++++++++++-------- vnet/vnet/lisp-gpe/lisp_gpe.h | 36 +++++- 7 files changed, 407 insertions(+), 113 deletions(-) (limited to 'vnet') diff --git a/vnet/vnet/lisp-cp/control.c b/vnet/vnet/lisp-cp/control.c index 4424b60199a..c28276562c8 100644 --- a/vnet/vnet/lisp-cp/control.c +++ b/vnet/vnet/lisp-cp/control.c @@ -239,13 +239,13 @@ dp_del_fwd_entry (lisp_cp_main_t * lcm, u32 src_map_index, u32 dst_map_index) * */ static u32 -get_locator_pair (lisp_cp_main_t* lcm, mapping_t * lcl_map, mapping_t * rmt_map, - locator_pair_t ** locator_pairs) +get_locator_pairs (lisp_cp_main_t* lcm, mapping_t * lcl_map, + mapping_t * rmt_map, locator_pair_t ** locator_pairs) { - u32 i, minp = ~0, limitp = 0, li, check_index = 0, done = 0, esi; + u32 i, limitp = 0, li, found = 0, esi; locator_set_t * rmt_ls, * lcl_ls; - ip_address_t _lcl, * lcl = &_lcl; - locator_t * l, * rmt = 0; + ip_address_t _lcl_addr, * lcl_addr = &_lcl_addr; + locator_t * lp, * rmt = 0; uword * checked = 0; locator_pair_t pair; @@ -255,7 +255,7 @@ get_locator_pair (lisp_cp_main_t* lcm, mapping_t * lcl_map, mapping_t * rmt_map, if (!rmt_ls || vec_len(rmt_ls->locator_indices) == 0) return 0; - while (!done) + while (1) { rmt = 0; @@ -266,22 +266,28 @@ get_locator_pair (lisp_cp_main_t* lcm, mapping_t * lcl_map, mapping_t * rmt_map, continue; li = vec_elt(rmt_ls->locator_indices, i); - l = pool_elt_at_index(lcm->locator_pool, li); + lp = pool_elt_at_index(lcm->locator_pool, li); /* we 
don't support non-IP locators for now */ - if (gid_address_type(&l->address) != GID_ADDR_IP_PREFIX) + if (gid_address_type(&lp->address) != GID_ADDR_IP_PREFIX) continue; - if (l->priority < minp && l->priority >= limitp) + if ((found && lp->priority == limitp) + || (!found && lp->priority >= limitp)) { - minp = l->priority; - rmt = l; - check_index = i; + rmt = lp; + + /* don't search for locators with lower priority and don't + * check this locator again*/ + limitp = lp->priority; + hash_set(checked, i, 1); + break; } } /* check if a local locator with a route to remote locator exists */ if (rmt != 0) { + /* find egress sw_if_index for rmt locator */ esi = ip_fib_get_egress_iface_for_dst ( lcm, &gid_address_ip(&rmt->address)); if ((u32) ~0 == esi) @@ -292,31 +298,31 @@ get_locator_pair (lisp_cp_main_t* lcm, mapping_t * lcl_map, mapping_t * rmt_map, li = vec_elt (lcl_ls->locator_indices, i); locator_t * sl = pool_elt_at_index (lcm->locator_pool, li); - /* found local locator */ + /* found local locator with the needed sw_if_index*/ if (sl->sw_if_index == esi) { + /* and it has an address */ if (0 == ip_interface_get_first_ip_address (lcm, sl->sw_if_index, - gid_address_ip_version(&rmt->address), lcl)) + gid_address_ip_version(&rmt->address), lcl_addr)) continue; memset(&pair, 0, sizeof(pair)); - ip_address_copy(&pair.rmt_loc, &gid_address_ip(&rmt->address)); - ip_address_copy(&pair.lcl_loc, lcl); + ip_address_copy (&pair.rmt_loc, + &gid_address_ip(&rmt->address)); + ip_address_copy(&pair.lcl_loc, lcl_addr); + pair.weight = rmt->weight; vec_add1(locator_pairs[0], pair); - done = 2; + found = 1; } } - - /* skip this remote locator in next searches */ - limitp = minp; - hash_set(checked, check_index, 1); } else - done = 1; + break; } + hash_free(checked); - return (done == 2) ? 
1 : 0; + return found; } static void @@ -369,7 +375,7 @@ dp_add_fwd_entry (lisp_cp_main_t* lcm, u32 src_map_index, u32 dst_map_index) } /* find best locator pair that 1) verifies LISP policy 2) are connected */ - if (0 == get_locator_pair (lcm, src_map, dst_map, &a->locator_pairs)) + if (0 == get_locator_pairs (lcm, src_map, dst_map, &a->locator_pairs)) { /* negative entry */ a->is_negative = 1; diff --git a/vnet/vnet/lisp-cp/gid_dictionary.c b/vnet/vnet/lisp-cp/gid_dictionary.c index a6699b99f16..d4875f25d41 100644 --- a/vnet/vnet/lisp-cp/gid_dictionary.c +++ b/vnet/vnet/lisp-cp/gid_dictionary.c @@ -21,7 +21,7 @@ mac_lookup (gid_dictionary_t * db, u32 vni, u8 * key) int rv; BVT(clib_bihash_kv) kv, value; - kv.key[0] = ((u64 *)key)[0] & MAC_BIT_MASK; + kv.key[0] = mac_to_u64(key); kv.key[1] = (u64)vni; kv.key[2] = 0; diff --git a/vnet/vnet/lisp-cp/lisp_types.h b/vnet/vnet/lisp-cp/lisp_types.h index 2587fce526b..d0ca6e0bf5d 100644 --- a/vnet/vnet/lisp-cp/lisp_types.h +++ b/vnet/vnet/lisp-cp/lisp_types.h @@ -231,7 +231,11 @@ void _n ## _copy (void * dst , void * src); foreach_gid_address_type_fcns #undef _ -#define MAC_BIT_MASK (((u64)1 << 48) - 1) +always_inline u64 +mac_to_u64(u8 *m) +{ + return (*((u64 *)m) & 0xffffffffffff); +} typedef struct { @@ -286,7 +290,7 @@ typedef struct locator_pair ip_address_t lcl_loc; ip_address_t rmt_loc; - u8 priority; + u8 priority; /* TODO remove */ u8 weight; } locator_pair_t; diff --git a/vnet/vnet/lisp-gpe/interface.c b/vnet/vnet/lisp-gpe/interface.c index 77ad9a33171..c7bba47b1c2 100644 --- a/vnet/vnet/lisp-gpe/interface.c +++ b/vnet/vnet/lisp-gpe/interface.c @@ -73,19 +73,25 @@ get_one_tunnel_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, always_inline void encap_one_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, - lisp_gpe_tunnel_t * t0, u32 * next0, u8 is_v4) + lisp_gpe_tunnel_t * t0, u32 * next0) { ASSERT(sizeof(ip4_udp_lisp_gpe_header_t) == 36); ASSERT(sizeof(ip6_udp_lisp_gpe_header_t) == 56); - if (is_v4) + 
lisp_gpe_sub_tunnel_t * st0; + u32 * sti0; + + sti0 = vec_elt_at_index(t0->sub_tunnels_lbv, + vnet_buffer(b0)->ip.flow_hash % t0->sub_tunnels_lbv_count); + st0 = vec_elt_at_index(t0->sub_tunnels, sti0[0]); + if (st0->is_ip4) { - ip_udp_encap_one (lgm->vlib_main, b0, t0->rewrite, 36, 1); + ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 36, 1); next0[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP; } else { - ip_udp_encap_one (lgm->vlib_main, b0, t0->rewrite, 56, 0); + ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 56, 0); next0[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP; } @@ -129,22 +135,51 @@ get_two_tunnels_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, always_inline void encap_two_inline (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, vlib_buffer_t * b1, lisp_gpe_tunnel_t * t0, lisp_gpe_tunnel_t * t1, u32 * next0, - u32 * next1, u8 is_v4) + u32 * next1) { ASSERT(sizeof(ip4_udp_lisp_gpe_header_t) == 36); ASSERT(sizeof(ip6_udp_lisp_gpe_header_t) == 56); - if (is_v4) + lisp_gpe_sub_tunnel_t * st0, * st1; + u32 * sti0, * sti1; + sti0 = vec_elt_at_index(t0->sub_tunnels_lbv, + vnet_buffer(b0)->ip.flow_hash % t0->sub_tunnels_lbv_count); + sti1 = vec_elt_at_index(t1->sub_tunnels_lbv, + vnet_buffer(b1)->ip.flow_hash % t1->sub_tunnels_lbv_count); + st0 = vec_elt_at_index(t0->sub_tunnels, sti0[0]); + st1 = vec_elt_at_index(t1->sub_tunnels, sti1[0]); + + if (PREDICT_TRUE(st0->is_ip4 == st1->is_ip4)) { - ip_udp_encap_one (lgm->vlib_main, b0, t0->rewrite, 36, 1); - ip_udp_encap_one (lgm->vlib_main, b1, t1->rewrite, 36, 1); - next0[0] = next1[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP; + if (st0->is_ip4) + { + ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 36, 1); + ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 36, 1); + next0[0] = next1[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP; + } + else + { + ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 56, 0); + ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 56, 0); + next0[0] = next1[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP; + } } else { - ip_udp_encap_one 
(lgm->vlib_main, b0, t0->rewrite, 56, 0); - ip_udp_encap_one (lgm->vlib_main, b1, t1->rewrite, 56, 0); - next0[0] = next1[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP; + if (st0->is_ip4) + { + ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 36, 1); + ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 56, 1); + next0[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP; + next1[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP; + } + else + { + ip_udp_encap_one (lgm->vlib_main, b0, st0->rewrite, 56, 1); + ip_udp_encap_one (lgm->vlib_main, b1, st1->rewrite, 36, 1); + next0[0] = LISP_GPE_TX_NEXT_IP6_LOOKUP; + next1[0] = LISP_GPE_TX_NEXT_IP4_LOOKUP; + } } /* Reset to look up tunnel partner in the configured FIB */ @@ -223,19 +258,7 @@ lisp_gpe_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node, get_one_tunnel_inline (lgm, b1, &t1, is_v4_eid1 ? 1 : 0); } - if (PREDICT_TRUE( - ip_addr_version(&t0->dst) == ip_addr_version(&t1->dst))) - { - encap_two_inline (lgm, b0, b1, t0, t1, &next0, &next1, - ip_addr_version(&t0->dst) == IP4 ? 1 : 0); - } - else - { - encap_one_inline (lgm, b0, t0, &next0, - ip_addr_version(&t0->dst) == IP4 ? 1 : 0); - encap_one_inline (lgm, b1, t1, &next1, - ip_addr_version(&t1->dst) == IP4 ? 1 : 0); - } + encap_two_inline (lgm, b0, b1, t0, t1, &next0, &next1); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -274,8 +297,7 @@ lisp_gpe_interface_tx (vlib_main_t * vm, vlib_node_runtime_t * node, is_v4_0 = is_v4_packet(vlib_buffer_get_current (b0)); get_one_tunnel_inline (lgm, b0, &t0, is_v4_0 ? 1 : 0); - encap_one_inline (lgm, b0, t0, &next0, - ip_addr_version(&t0->dst) == IP4 ? 
1 : 0); + encap_one_inline (lgm, b0, t0, &next0); if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) { @@ -517,6 +539,44 @@ l2_process_tunnel_action (vlib_buffer_t * b0, u8 action, u32 * next0) } } +always_inline u32 +ip_flow_hash (void * data) +{ + ip4_header_t * iph = (ip4_header_t *) data; + + if ((iph->ip_version_and_header_length & 0xF0) == 0x40) + return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT); + else + return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT); +} + +always_inline u32 +l2_flow_hash (vlib_buffer_t * b0) +{ + ethernet_header_t * eh; + u64 a, b, c; + uword is_ip, eh_size; + u16 eh_type; + + eh = vlib_buffer_get_current (b0); + eh_type = clib_net_to_host_u16(eh->type); + eh_size = ethernet_buffer_header_size(b0); + + is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6); + + /* since we have 2 cache lines, use them */ + if (is_ip) + a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size); + else + a = eh->type; + + b = mac_to_u64((u8 *)eh->dst_address); + c = mac_to_u64((u8 *)eh->src_address); + hash_mix64 (a, b, c); + + return (u32) c; +} + always_inline void l2_process_one (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, u32 ti0, u32 * next0) { @@ -527,8 +587,10 @@ l2_process_one (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, u32 ti0, u32 * next0) if (PREDICT_TRUE(LISP_NO_ACTION == t0->action)) { - encap_one_inline (lgm, b0, t0, next0, - ip_addr_version(&t0->dst) == IP4 ? 1 : 0); + /* compute 'flow' hash */ + if (PREDICT_TRUE(t0->sub_tunnels_lbv_count > 1)) + vnet_buffer(b0)->ip.flow_hash = l2_flow_hash (b0); + encap_one_inline (lgm, b0, t0, next0); } else { @@ -550,21 +612,26 @@ l2_process_two (lisp_gpe_main_t * lgm, vlib_buffer_t * b0, vlib_buffer_t * b1, if (PREDICT_TRUE(LISP_NO_ACTION == t0->action && LISP_NO_ACTION == t1->action)) { - encap_two_inline (lgm, b0, b1, t0, t1, next0, next1, - ip_addr_version(&t0->dst) == IP4 ? 
1 : 0); + if (PREDICT_TRUE(t0->sub_tunnels_lbv_count > 1)) + vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0); + if (PREDICT_TRUE(t1->sub_tunnels_lbv_count > 1)) + vnet_buffer(b1)->ip.flow_hash = l2_flow_hash(b1); + encap_two_inline (lgm, b0, b1, t0, t1, next0, next1); } else { if (LISP_NO_ACTION == t0->action) { - encap_one_inline (lgm, b0, t0, next0, - ip_addr_version(&t0->dst) == IP4 ? 1 : 0); + if (PREDICT_TRUE(t0->sub_tunnels_lbv_count > 1)) + vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0); + encap_one_inline (lgm, b0, t0, next0); l2_process_tunnel_action (b1, t1->action, next1); } else if (LISP_NO_ACTION == t1->action) { - encap_one_inline (lgm, b1, t1, next1, - ip_addr_version(&t1->dst) == IP4 ? 1 : 0); + if (PREDICT_TRUE(t1->sub_tunnels_lbv_count > 1)) + vnet_buffer(b1)->ip.flow_hash = l2_flow_hash(b1); + encap_one_inline (lgm, b1, t1, next1); l2_process_tunnel_action (b0, t0->action, next0); } else diff --git a/vnet/vnet/lisp-gpe/ip_forward.c b/vnet/vnet/lisp-gpe/ip_forward.c index 607687305c5..47f3f7b3755 100644 --- a/vnet/vnet/lisp-gpe/ip_forward.c +++ b/vnet/vnet/lisp-gpe/ip_forward.c @@ -832,8 +832,8 @@ lgpe_ip4_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, u32 bi0, bi1; vlib_buffer_t * b0, * b1; ip4_header_t * ip0, * ip1; - u32 dst_adj_index0, src_adj_index0, src_fib_index0, dst_adj_index1, - src_adj_index1, src_fib_index1; + u32 dst_adj_index0, src_adj_index0, src_fib_index0; + u32 dst_adj_index1, src_adj_index1, src_fib_index1; ip_adjacency_t * dst_adj0, * src_adj0, * dst_adj1, * src_adj1; u32 next0, next1; @@ -900,6 +900,17 @@ lgpe_ip4_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, src_adj0->rewrite_header.sw_if_index; vnet_buffer (b1)->sw_if_index[VLIB_TX] = src_adj1->rewrite_header.sw_if_index; + + /* if multipath: saved_lookup_next_index is reused to store + * nb of sub-tunnels. If greater than 1, multipath is on. + * Note that flow hash should be 0 after ipx lookup! 
*/ + if (PREDICT_TRUE(src_adj0->saved_lookup_next_index > 1)) + vnet_buffer (b0)->ip.flow_hash = ip4_compute_flow_hash ( + ip0, IP_FLOW_HASH_DEFAULT); + + if (PREDICT_TRUE(src_adj1->saved_lookup_next_index > 1)) + vnet_buffer (b1)->ip.flow_hash = ip4_compute_flow_hash ( + ip1, IP_FLOW_HASH_DEFAULT); } else { @@ -910,6 +921,10 @@ lgpe_ip4_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, next0 = src_adj0->explicit_fib_index; vnet_buffer (b0)->sw_if_index[VLIB_TX] = src_adj0->rewrite_header.sw_if_index; + + if (PREDICT_TRUE(src_adj0->saved_lookup_next_index > 1)) + vnet_buffer (b0)->ip.flow_hash = ip4_compute_flow_hash ( + ip0, IP_FLOW_HASH_DEFAULT); } else { @@ -923,6 +938,9 @@ lgpe_ip4_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, next1 = src_adj1->explicit_fib_index; vnet_buffer (b1)->sw_if_index[VLIB_TX] = src_adj1->rewrite_header.sw_if_index; + if (PREDICT_TRUE(src_adj1->saved_lookup_next_index > 1)) + vnet_buffer (b1)->ip.flow_hash = ip4_compute_flow_hash ( + ip1, IP_FLOW_HASH_DEFAULT); } else { @@ -978,6 +996,12 @@ lgpe_ip4_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, /* prepare packet for lisp-gpe output node */ vnet_buffer (b0)->sw_if_index[VLIB_TX] = src_adj0->rewrite_header.sw_if_index; + + /* if multipath: saved_lookup_next_index is reused to store + * nb of sub-tunnels. If greater than 1, multipath is on */ + if (PREDICT_TRUE(src_adj0->saved_lookup_next_index > 1)) + vnet_buffer (b0)->ip.flow_hash = ip4_compute_flow_hash ( + ip0, IP_FLOW_HASH_DEFAULT); } else { @@ -1163,6 +1187,17 @@ lgpe_ip6_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, src_adj0->rewrite_header.sw_if_index; vnet_buffer (b1)->sw_if_index[VLIB_TX] = src_adj1->rewrite_header.sw_if_index; + + /* if multipath: saved_lookup_next_index is reused to store + * nb of sub-tunnels. If greater than 1, multipath is on. + * Note that flow hash should be 0 after ipx lookup! 
*/ + if (PREDICT_TRUE(src_adj0->saved_lookup_next_index > 1)) + vnet_buffer (b0)->ip.flow_hash = ip6_compute_flow_hash ( + ip0, IP_FLOW_HASH_DEFAULT); + + if (PREDICT_TRUE(src_adj1->saved_lookup_next_index > 1)) + vnet_buffer (b1)->ip.flow_hash = ip6_compute_flow_hash ( + ip1, IP_FLOW_HASH_DEFAULT); } else { @@ -1173,6 +1208,10 @@ lgpe_ip6_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, next0 = src_adj0->explicit_fib_index; vnet_buffer (b0)->sw_if_index[VLIB_TX] = src_adj0->rewrite_header.sw_if_index; + + if (PREDICT_TRUE(src_adj0->saved_lookup_next_index > 1)) + vnet_buffer (b0)->ip.flow_hash = ip6_compute_flow_hash ( + ip0, IP_FLOW_HASH_DEFAULT); } else { @@ -1186,6 +1225,10 @@ lgpe_ip6_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, next1 = src_adj1->explicit_fib_index; vnet_buffer (b1)->sw_if_index[VLIB_TX] = src_adj1->rewrite_header.sw_if_index; + + if (PREDICT_TRUE(src_adj1->saved_lookup_next_index > 1)) + vnet_buffer (b1)->ip.flow_hash = ip6_compute_flow_hash ( + ip1, IP_FLOW_HASH_DEFAULT); } else { @@ -1241,6 +1284,12 @@ lgpe_ip6_lookup (vlib_main_t * vm, vlib_node_runtime_t * node, /* prepare packet for lisp-gpe output node */ vnet_buffer (b0)->sw_if_index[VLIB_TX] = src_adj0->rewrite_header.sw_if_index; + + /* if multipath: saved_lookup_next_index is reused to store + * nb of sub-tunnels. 
If greater than 1, multipath is on */ + if (PREDICT_TRUE(src_adj0->saved_lookup_next_index > 1)) + vnet_buffer (b0)->ip.flow_hash = ip6_compute_flow_hash ( + ip0, IP_FLOW_HASH_DEFAULT); } else { diff --git a/vnet/vnet/lisp-gpe/lisp_gpe.c b/vnet/vnet/lisp-gpe/lisp_gpe.c index e5d3500fb06..922788538a3 100644 --- a/vnet/vnet/lisp-gpe/lisp_gpe.c +++ b/vnet/vnet/lisp-gpe/lisp_gpe.c @@ -14,17 +14,19 @@ */ #include +#include lisp_gpe_main_t lisp_gpe_main; static int -lisp_gpe_rewrite (lisp_gpe_tunnel_t * t) +lisp_gpe_rewrite (lisp_gpe_tunnel_t * t, lisp_gpe_sub_tunnel_t * st, + locator_pair_t * lp) { u8 *rw = 0; lisp_gpe_header_t * lisp0; int len; - if (ip_addr_version(&t->src) == IP4) + if (ip_addr_version(&lp->lcl_loc) == IP4) { ip4_header_t * ip0; ip4_udp_lisp_gpe_header_t * h0; @@ -41,8 +43,8 @@ lisp_gpe_rewrite (lisp_gpe_tunnel_t * t) ip0->protocol = IP_PROTOCOL_UDP; /* we fix up the ip4 header length and checksum after-the-fact */ - ip_address_copy_addr(&ip0->src_address, &t->src); - ip_address_copy_addr(&ip0->dst_address, &t->dst); + ip_address_copy_addr(&ip0->src_address, &lp->lcl_loc); + ip_address_copy_addr(&ip0->dst_address, &lp->rmt_loc); ip0->checksum = ip4_header_checksum (ip0); /* UDP header, randomize src port on something, maybe? */ @@ -70,8 +72,8 @@ lisp_gpe_rewrite (lisp_gpe_tunnel_t * t) ip0->protocol = IP_PROTOCOL_UDP; /* we fix up the ip6 header length after-the-fact */ - ip_address_copy_addr(&ip0->src_address, &t->src); - ip_address_copy_addr(&ip0->dst_address, &t->dst); + ip_address_copy_addr(&ip0->src_address, &lp->lcl_loc); + ip_address_copy_addr(&ip0->dst_address, &lp->rmt_loc); /* UDP header, randomize src port on something, maybe? 
*/ h0->udp.src_port = clib_host_to_net_u16 (4341); @@ -87,10 +89,133 @@ lisp_gpe_rewrite (lisp_gpe_tunnel_t * t) lisp0->next_protocol = t->next_protocol; lisp0->iid = clib_host_to_net_u32 (t->vni); - t->rewrite = rw; + st->is_ip4 = ip_addr_version(&lp->lcl_loc) == IP4; + st->rewrite = rw; return 0; } +static int +weight_cmp (normalized_sub_tunnel_weights_t *a, + normalized_sub_tunnel_weights_t *b) +{ + int cmp = a->weight - b->weight; + return (cmp == 0 + ? a->sub_tunnel_index - b->sub_tunnel_index + : (cmp > 0 ? -1 : 1)); +} + +/** Computes sub tunnel load balancing vector. + * Algorithm is identical to that used for building unequal-cost multipath + * adjacencies */ +static void +compute_sub_tunnels_balancing_vector (lisp_gpe_tunnel_t * t) +{ + uword n_sts, i, n_nsts, n_nsts_left; + f64 sum_weight, norm, error, tolerance; + normalized_sub_tunnel_weights_t * nsts = 0, * stp; + lisp_gpe_sub_tunnel_t * sts = t->sub_tunnels; + u32 * st_lbv = 0; + + /* Accept 1% error */ + tolerance = .01; + + n_sts = vec_len (sts); + vec_validate(nsts, 2 * n_sts - 1); + + sum_weight = 0; + for (i = 0; i < n_sts; i++) + { + /* Find total weight to normalize weights. */ + sum_weight += sts[i].weight; + + /* build normalized sub tunnels vector */ + nsts[i].weight = sts[i].weight; + nsts[i].sub_tunnel_index = i; + } + + n_nsts = n_sts; + if (n_sts == 1) + { + nsts[0].weight = 1; + _vec_len(nsts) = 1; + goto build_lbv; + } + + /* Sort sub-tunnels by weight */ + qsort (nsts, n_nsts, sizeof(u32), (void * )weight_cmp); + + /* Save copies of all next hop weights to avoid being overwritten in loop below. */ + for (i = 0; i < n_nsts; i++) + nsts[n_nsts + i].weight = nsts[i].weight; + + /* Try larger and larger power of 2 sized blocks until we + find one where traffic flows to within 1% of specified weights. 
*/ + for (n_nsts = max_pow2 (n_sts); ; n_nsts *= 2) + { + error = 0; + + norm = n_nsts / sum_weight; + n_nsts_left = n_nsts; + for (i = 0; i < n_sts; i++) + { + f64 nf = nsts[n_sts + i].weight * norm; + word n = flt_round_nearest (nf); + + n = n > n_nsts_left ? n_nsts_left : n; + n_nsts_left -= n; + error += fabs (nf - n); + nsts[i].weight = n; + } + + nsts[0].weight += n_nsts_left; + + /* Is the total rounding error within the 1% tolerance for this block size? */ + if (error <= tolerance * n_nsts) + { + /* Truncate any next hops with zero weight. */ + _vec_len (nsts) = i; + break; + } + } + + build_lbv: + + /* build load balancing vector */ + vec_foreach (stp, nsts) + { + for (i = 0; i < stp[0].weight; i++) + vec_add1(st_lbv, stp[0].sub_tunnel_index); + } + + t->sub_tunnels_lbv = st_lbv; + t->sub_tunnels_lbv_count = n_nsts; + t->norm_sub_tunnel_weights = nsts; +} + +static void +create_sub_tunnels (lisp_gpe_main_t * lgm, lisp_gpe_tunnel_t * t) +{ + lisp_gpe_sub_tunnel_t st; + locator_pair_t * lp = 0; + int i; + + /* create sub-tunnels for all locator pairs */ + for (i = 0; i < vec_len(t->locator_pairs); i++) + { + lp = &t->locator_pairs[i]; + st.locator_pair_index = i; + st.parent_index = t - lgm->tunnels; + st.weight = lp->weight; + + /* compute rewrite for sub-tunnel */ + lisp_gpe_rewrite (t, &st, lp); + vec_add1(t->sub_tunnels, st); + } + + /* normalize weights and compute sub-tunnel load balancing vector */ + compute_sub_tunnels_balancing_vector(t); +} + #define foreach_copy_field \ _(encap_fib_index) \ _(decap_fib_index) \ @@ -105,8 +230,8 @@ add_del_ip_tunnel (vnet_lisp_gpe_add_del_fwd_entry_args_t *a, u8 is_l2, lisp_gpe_main_t * lgm = &lisp_gpe_main; lisp_gpe_tunnel_t *t = 0; lisp_gpe_tunnel_key_t key; + lisp_gpe_sub_tunnel_t * stp = 0; uword * p; - int rv; /* prepare tunnel key */ memset(&key, 0, sizeof(key)); @@ -138,12 +263,7 @@ add_del_ip_tunnel (vnet_lisp_gpe_add_del_fwd_entry_args_t *a, u8 is_l2, foreach_copy_field; #undef _ - /* TODO multihoming */ - 
if (!a->is_negative) - { - ip_address_copy (&t->src, &a->locator_pairs[0].lcl_loc); - ip_address_copy (&t->dst, &a->locator_pairs[0].rmt_loc); - } + t->locator_pairs = vec_dup(a->locator_pairs); /* if vni is non-default */ if (a->vni) @@ -159,14 +279,9 @@ add_del_ip_tunnel (vnet_lisp_gpe_add_del_fwd_entry_args_t *a, u8 is_l2, else t->next_protocol = LISP_GPE_NEXT_PROTO_ETHERNET; - /* compute rewrite */ - rv = lisp_gpe_rewrite (t); - - if (rv) - { - pool_put(lgm->tunnels, t); - return rv; - } + /* build sub-tunnels for lowest priority locator-pairs */ + if (!a->is_negative) + create_sub_tunnels (lgm, t); mhash_set(&lgm->lisp_gpe_tunnel_by_key, &key, t - lgm->tunnels, 0); @@ -188,7 +303,13 @@ add_del_ip_tunnel (vnet_lisp_gpe_add_del_fwd_entry_args_t *a, u8 is_l2, mhash_unset(&lgm->lisp_gpe_tunnel_by_key, &key, 0); - vec_free(t->rewrite); + vec_foreach(stp, t->sub_tunnels) + { + vec_free(stp->rewrite); + } + vec_free(t->sub_tunnels); + vec_free(t->sub_tunnels_lbv); + vec_free(t->locator_pairs); pool_put(lgm->tunnels, t); } @@ -197,8 +318,8 @@ add_del_ip_tunnel (vnet_lisp_gpe_add_del_fwd_entry_args_t *a, u8 is_l2, static int build_ip_adjacency (lisp_gpe_main_t * lgm, ip_adjacency_t * adj, u32 table_id, - u32 vni, u32 tun_index, u8 is_negative, u8 action, - u8 ip_ver) + u32 vni, u32 tun_index, u32 n_sub_tun, u8 is_negative, + u8 action, u8 ip_ver) { uword * lookup_next_index, * lgpe_sw_if_index, * lnip; @@ -222,14 +343,15 @@ build_ip_adjacency (lisp_gpe_main_t * lgm, ip_adjacency_t * adj, u32 table_id, /* the assumption is that the interface must've been created before * programming the dp */ - ASSERT(lookup_next_index != 0); - ASSERT(lgpe_sw_if_index != 0); + ASSERT(lookup_next_index != 0 && lgpe_sw_if_index != 0); - /* hijack explicit fib index to store lisp interface node index and - * if_address_index for the tunnel index */ + /* hijack explicit fib index to store lisp interface node index, + * if_address_index for the tunnel index and saved lookup next index + * 
for the number of sub tunnels */ adj->explicit_fib_index = lookup_next_index[0]; adj->if_address_index = tun_index; adj->rewrite_header.sw_if_index = lgpe_sw_if_index[0]; + adj->saved_lookup_next_index = n_sub_tun; } /* negative mapping */ else @@ -268,7 +390,8 @@ add_del_ip_fwd_entry (lisp_gpe_main_t * lgm, vnet_lisp_gpe_add_del_fwd_entry_args_t * a) { ip_adjacency_t adj, * adjp; - u32 rv, tun_index = ~0; + lisp_gpe_tunnel_t * t; + u32 rv, tun_index = ~0, n_sub_tuns = 0; ip_prefix_t * rmt_pref, * lcl_pref; u8 ip_ver; @@ -285,11 +408,17 @@ add_del_ip_fwd_entry (lisp_gpe_main_t * lgm, clib_warning ("failed to build tunnel!"); return rv; } + if (a->is_add) + { + t = pool_elt_at_index(lgm->tunnels, tun_index); + n_sub_tuns = t->sub_tunnels_lbv_count; + } } /* setup adjacency for eid */ rv = build_ip_adjacency (lgm, &adj, a->table_id, a->vni, tun_index, - a->is_negative, a->action, ip_ver); + n_sub_tuns, a->is_negative, a->action, + ip_ver); /* add/delete route for eid */ rv |= ip_sd_fib_add_del_route (lgm, rmt_pref, lcl_pref, a->table_id, &adj, @@ -321,8 +450,8 @@ static void make_mac_fib_key (BVT(clib_bihash_kv) *kv, u16 bd_index, u8 src_mac[6], u8 dst_mac[6]) { - kv->key[0] = (((u64) bd_index) << 48) | (((u64 *)dst_mac)[0] & MAC_BIT_MASK); - kv->key[1] = ((u64 *)src_mac)[0] & MAC_BIT_MASK; + kv->key[0] = (((u64) bd_index) << 48) | mac_to_u64(dst_mac); + kv->key[1] = mac_to_u64(src_mac); kv->key[2] = 0; } @@ -593,18 +722,15 @@ u8 * format_lisp_gpe_tunnel (u8 * s, va_list * args) { lisp_gpe_tunnel_t * t = va_arg (*args, lisp_gpe_tunnel_t *); - lisp_gpe_main_t * lgm = &lisp_gpe_main; + lisp_gpe_main_t * lgm = vnet_lisp_gpe_get_main(); + locator_pair_t * lp = 0; + normalized_sub_tunnel_weights_t * nstw; - s = format (s, - "[%d] %U (src) %U (dst) fibs: encap %d, decap %d", - t - lgm->tunnels, - format_ip_address, &t->src, - format_ip_address, &t->dst, - t->encap_fib_index, - t->decap_fib_index); - - s = format (s, " decap next %U\n", format_decap_next, 
t->decap_next_index); - s = format (s, "lisp ver %d ", (t->ver_res>>6)); + s = format (s, "tunnel %d vni %d (0x%x)\n", t - lgm->tunnels, t->vni, t->vni); + s = format (s, " fibs: encap %d, decap %d decap next %U\n", + t->encap_fib_index, t->decap_fib_index, format_decap_next, + t->decap_next_index); + s = format (s, " lisp ver %d ", (t->ver_res>>6)); #define _(n,v) if (t->flags & v) s = format (s, "%s-bit ", #n); foreach_lisp_gpe_flag_bit; @@ -613,7 +739,21 @@ format_lisp_gpe_tunnel (u8 * s, va_list * args) s = format (s, "next_protocol %d ver_res %x res %x\n", t->next_protocol, t->ver_res, t->res); - s = format (s, "iid %d (0x%x)\n", t->vni, t->vni); + s = format (s, " locator-pairs:\n"); + vec_foreach(lp, t->locator_pairs) + { + s = format (s, " local: %U remote: %U weight %d\n", + format_ip_address, &lp->lcl_loc, format_ip_address, + &lp->rmt_loc, lp->weight); + } + + s = format (s, " active sub-tunnels:\n"); + vec_foreach(nstw, t->norm_sub_tunnel_weights) + { + lp = vec_elt_at_index(t->locator_pairs, nstw->sub_tunnel_index); + s = format (s, " local: %U remote: %U weight %d\n", format_ip_address, + &lp->lcl_loc, format_ip_address, &lp->rmt_loc, nstw->weight); + } return s; } diff --git a/vnet/vnet/lisp-gpe/lisp_gpe.h b/vnet/vnet/lisp-gpe/lisp_gpe.h index f3e75772ee7..145b5d493e5 100644 --- a/vnet/vnet/lisp-gpe/lisp_gpe.h +++ b/vnet/vnet/lisp-gpe/lisp_gpe.h @@ -54,17 +54,45 @@ typedef struct }; } lisp_gpe_tunnel_key_t; -typedef struct +typedef struct lisp_gpe_sub_tunnel { /* Rewrite string. 
$$$$ embed vnet_rewrite header */ u8 * rewrite; + u32 parent_index; + u32 locator_pair_index; + u8 weight; + u8 is_ip4; +} lisp_gpe_sub_tunnel_t; + +typedef struct nomalized_sub_tunnel +{ + u32 sub_tunnel_index; + u8 weight; +} normalized_sub_tunnel_weights_t; + +typedef struct +{ + /* tunnel src and dst addresses */ + locator_pair_t * locator_pairs; + + /* locator-pairs with best priority become sub-tunnels */ + lisp_gpe_sub_tunnel_t * sub_tunnels; + + /* sub-tunnels load balancing vector: contains list of sub-tunnel + * indexes replicated according to weight */ + u32 * sub_tunnels_lbv; + + /* number of entries in load balancing vector */ + u32 sub_tunnels_lbv_count; + + /* normalized sub tunnel weights */ + normalized_sub_tunnel_weights_t * norm_sub_tunnel_weights; /* decap next index */ u32 decap_next_index; - /* tunnel src and dst addresses */ - ip_address_t src; - ip_address_t dst; + /* TODO remove */ + ip_address_t src, dst; /* FIB indices */ u32 encap_fib_index; /* tunnel partner lookup here */ -- cgit 1.2.3-korg