/* * Copyright (c) 2016 Cisco and/or its affiliates. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include //GC runs at most once every so many seconds #define LB_GARBAGE_RUN 60 //After so many seconds. It is assumed that inter-core race condition will not occur. #define LB_CONCURRENCY_TIMEOUT 10 lb_main_t lb_main; #define lb_get_writer_lock() clib_spinlock_lock (&lb_main.writer_lock) #define lb_put_writer_lock() clib_spinlock_unlock (&lb_main.writer_lock) static void lb_as_stack (lb_as_t *as); const static char * const lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL }; const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL }; const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, }; const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL }; const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL }; const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, }; const static char * const lb_dpo_gre4_ip4_port[] = { "lb4-gre4-port" , NULL }; const static char * const lb_dpo_gre4_ip6_port[] = { "lb6-gre4-port" , NULL }; const static char* const * const lb_dpo_gre4_port_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_gre4_ip4_port, [DPO_PROTO_IP6] = lb_dpo_gre4_ip6_port, }; const static char * const lb_dpo_gre6_ip4_port[] = { "lb4-gre6-port" , NULL 
}; const static char * const lb_dpo_gre6_ip6_port[] = { "lb6-gre6-port" , NULL }; const static char* const * const lb_dpo_gre6_port_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_gre6_ip4_port, [DPO_PROTO_IP6] = lb_dpo_gre6_ip6_port, }; const static char * const lb_dpo_l3dsr_ip4[] = {"lb4-l3dsr" , NULL}; const static char* const * const lb_dpo_l3dsr_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4, }; const static char * const lb_dpo_l3dsr_ip4_port[] = {"lb4-l3dsr-port" , NULL}; const static char* const * const lb_dpo_l3dsr_port_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_l3dsr_ip4_port, }; const static char * const lb_dpo_nat4_ip4_port[] = { "lb4-nat4-port" , NULL }; const static char* const * const lb_dpo_nat4_port_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP4] = lb_dpo_nat4_ip4_port, }; const static char * const lb_dpo_nat6_ip6_port[] = { "lb6-nat6-port" , NULL }; const static char* const * const lb_dpo_nat6_port_nodes[DPO_PROTO_NUM] = { [DPO_PROTO_IP6] = lb_dpo_nat6_ip6_port, }; u32 lb_hash_time_now(vlib_main_t * vm) { return (u32) (vlib_time_now(vm) + 10000); } u8 *format_lb_main (u8 * s, va_list * args) { vlib_thread_main_t *tm = vlib_get_thread_main(); lb_main_t *lbm = &lb_main; s = format(s, "lb_main"); s = format(s, " ip4-src-address: %U \n", format_ip4_address, &lbm->ip4_src_address); s = format(s, " ip6-src-address: %U \n", format_ip6_address, &lbm->ip6_src_address); s = format(s, " #vips: %u\n", pool_elts(lbm->vips)); s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1); u32 thread_index; for(thread_index = 0; thread_index < tm->n_vlib_mains; thread_index++ ) { lb_hash_t *h = lbm->per_cpu[thread_index].sticky_ht; if (h) { s = format(s, "core %d\n", thread_index); s = format(s, " timeout: %ds\n", h->timeout); s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h)); } } return s; } static char *lb_vip_type_strings[] = { [LB_VIP_TYPE_IP6_GRE6] = "ip6-gre6", [LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4", 
[LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6", [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4", [LB_VIP_TYPE_IP4_L3DSR] = "ip4-l3dsr", [LB_VIP_TYPE_IP4_NAT4] = "ip4-nat4", [LB_VIP_TYPE_IP6_NAT6] = "ip6-nat6", }; u8 *format_lb_vip_type (u8 * s, va_list * args) { lb_vip_type_t vipt = va_arg (*args, lb_vip_type_t); u32 i; for (i=0; itype, format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY, vip->new_flow_table_mask + 1, pool_elts(vip->as_indexes), (vip->flags & LB_VIP_FLAGS_USED)?"":" removed"); if (vip->port != 0) { s = format(s, " protocol:%u port:%u ", vip->protocol, vip->port); } if (vip->type == LB_VIP_TYPE_IP4_L3DSR) { s = format(s, " dscp:%u", vip->encap_args.dscp); } else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) || (vip->type == LB_VIP_TYPE_IP6_NAT6)) { s = format (s, " type:%s port:%u target_port:%u", (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip": "nodeport", ntohs(vip->port), ntohs(vip->encap_args.target_port)); } return s; } u8 *format_lb_as (u8 * s, va_list * args) { lb_as_t *as = va_arg (*args, lb_as_t *); return format(s, "%U %s", format_ip46_address, &as->address, IP46_TYPE_ANY, (as->flags & LB_AS_FLAGS_USED)?"used":"removed"); } u8 *format_lb_vip_detailed (u8 * s, va_list * args) { lb_main_t *lbm = &lb_main; lb_vip_t *vip = va_arg (*args, lb_vip_t *); u32 indent = format_get_indent (s); s = format(s, "%U %U [%lu] %U%s\n" "%U new_size:%u\n", format_white_space, indent, format_lb_vip_type, vip->type, vip - lbm->vips, format_ip46_prefix, &vip->prefix, (u32) vip->plen, IP46_TYPE_ANY, (vip->flags & LB_VIP_FLAGS_USED)?"":" removed", format_white_space, indent, vip->new_flow_table_mask + 1); if (vip->port != 0) { s = format(s, "%U protocol:%u port:%u\n", format_white_space, indent, vip->protocol, vip->port); } if (vip->type == LB_VIP_TYPE_IP4_L3DSR) { s = format(s, "%U dscp:%u\n", format_white_space, indent, vip->encap_args.dscp); } else if ((vip->type == LB_VIP_TYPE_IP4_NAT4) || (vip->type == LB_VIP_TYPE_IP6_NAT6)) { s = format (s, "%U type:%s port:%u 
target_port:%u", format_white_space, indent, (vip->encap_args.srv_type == LB_SRV_TYPE_CLUSTERIP)?"clusterip": "nodeport", ntohs(vip->port), ntohs(vip->encap_args.target_port)); } //Print counters s = format(s, "%U counters:\n", format_white_space, indent); u32 i; for (i=0; ivip_counters[i].name, vlib_get_simple_counter(&lbm->vip_counters[i], vip - lbm->vips)); s = format(s, "%U #as:%u\n", format_white_space, indent, pool_elts(vip->as_indexes)); //Let's count the buckets for each AS u32 *count = 0; vec_validate(count, pool_len(lbm->ass)); //Possibly big alloc for not much... lb_new_flow_entry_t *nfe; vec_foreach(nfe, vip->new_flow_table) count[nfe->as_index]++; lb_as_t *as; u32 *as_index; pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; s = format(s, "%U %U %u buckets %Lu flows dpo:%u %s\n", format_white_space, indent, format_ip46_address, &as->address, IP46_TYPE_ANY, count[as - lbm->ass], vlib_refcount_get(&lbm->as_refcount, as - lbm->ass), as->dpo.dpoi_index, (as->flags & LB_AS_FLAGS_USED)?"used":" removed"); }); vec_free(count); return s; } typedef struct { u32 as_index; u32 last; u32 skip; } lb_pseudorand_t; static int lb_pseudorand_compare(void *a, void *b) { lb_as_t *asa, *asb; lb_main_t *lbm = &lb_main; asa = &lbm->ass[((lb_pseudorand_t *)a)->as_index]; asb = &lbm->ass[((lb_pseudorand_t *)b)->as_index]; return memcmp(&asa->address, &asb->address, sizeof(asb->address)); } static void lb_vip_garbage_collection(lb_vip_t *vip) { lb_main_t *lbm = &lb_main; lb_snat4_key_t m_key4; clib_bihash_kv_8_8_t kv4, value4; lb_snat6_key_t m_key6; clib_bihash_kv_24_8_t kv6, value6; lb_snat_mapping_t *m = 0; CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock); u32 now = (u32) vlib_time_now(vlib_get_main()); if (!clib_u32_loop_gt(now, vip->last_garbage_collection + LB_GARBAGE_RUN)) return; vip->last_garbage_collection = now; lb_as_t *as; u32 *as_index; pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; if (!(as->flags & LB_AS_FLAGS_USED) && 
//Not used clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) { //Not referenced if (lb_vip_is_nat4_port(vip)) { m_key4.addr = as->address.ip4; m_key4.port = vip->encap_args.target_port; m_key4.protocol = 0; m_key4.fib_index = 0; kv4.key = m_key4.as_u64; if(!clib_bihash_search_8_8(&lbm->mapping_by_as4, &kv4, &value4)) m = pool_elt_at_index (lbm->snat_mappings, value4.value); ASSERT (m); kv4.value = m - lbm->snat_mappings; clib_bihash_add_del_8_8(&lbm->mapping_by_as4, &kv4, 0); pool_put (lbm->snat_mappings, m); } else if (lb_vip_is_nat6_port(vip)) { m_key6.addr.as_u64[0] = as->address.ip6.as_u64[0]; m_key6.addr.as_u64[1] = as->address.ip6.as_u64[1]; m_key6.port = vip->encap_args.target_port; m_key6.protocol = 0; m_key6.fib_index = 0; kv6.key[0] = m_key6.as_u64[0]; kv6.key[1] = m_key6.as_u64[1]; kv6.key[2] = m_key6.as_u64[2]; if (!clib_bihash_search_24_8 (&lbm->mapping_by_as6, &kv6, &value6)) m = pool_elt_at_index (lbm->snat_mappings, value6.value); ASSERT (m); kv6.value = m - lbm->snat_mappings; clib_bihash_add_del_24_8(&lbm->mapping_by_as6, &kv6, 0); pool_put (lbm->snat_mappings, m); } fib_entry_child_remove(as->next_hop_fib_entry_index, as->next_hop_child_index); fib_table_entry_delete_index(as->next_hop_fib_entry_index, FIB_SOURCE_RR); as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID; pool_put(vip->as_indexes, as_index); pool_put(lbm->ass, as); } }); } void lb_garbage_collection() { lb_main_t *lbm = &lb_main; lb_get_writer_lock(); lb_vip_t *vip; u32 *to_be_removed_vips = 0, *i; pool_foreach(vip, lbm->vips, { lb_vip_garbage_collection(vip); if (!(vip->flags & LB_VIP_FLAGS_USED) && (pool_elts(vip->as_indexes) == 0)) { vec_add1(to_be_removed_vips, vip - lbm->vips); } }); vec_foreach(i, to_be_removed_vips) { vip = &lbm->vips[*i]; pool_put(lbm->vips, vip); pool_free(vip->as_indexes); } vec_free(to_be_removed_vips); lb_put_writer_lock(); } static void lb_vip_update_new_flow_table(lb_vip_t 
*vip) { lb_main_t *lbm = &lb_main; lb_new_flow_entry_t *old_table; u32 i, *as_index; lb_new_flow_entry_t *new_flow_table = 0; lb_as_t *as; lb_pseudorand_t *pr, *sort_arr = 0; CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock); // We must have the lock //Check if some AS is configured or not i = 0; pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; if (as->flags & LB_AS_FLAGS_USED) { //Not used anymore i = 1; goto out; //Not sure 'break' works in this macro-loop } }); out: if (i == 0) { //Only the default. i.e. no AS vec_validate(new_flow_table, vip->new_flow_table_mask); for (i=0; ias_indexes)); i = 0; pool_foreach(as_index, vip->as_indexes, { as = &lbm->ass[*as_index]; if (!(as->flags & LB_AS_FLAGS_USED)) //Not used anymore continue; sort_arr[i].as_index = as - lbm->ass; i++; }); _vec_len(sort_arr) = i; vec_sort_with_function(sort_arr, lb_pseudorand_compare); //Now let's pseudo-randomly generate permutations vec_foreach(pr, sort_arr) { lb_as_t *as = &lbm->ass[pr->as_index]; u64 seed = clib_xxhash(as->address.as_u64[0] ^ as->address.as_u64[1]); /* We have 2^n buckets. * skip must be prime with 2^n. * So skip must be odd. * MagLev actually state that M should be prime, * but this has a big computation cost (% operation). * Using 2^n is more better (& operation). 
*/ pr->skip = ((seed & 0xffffffff) | 1) & vip->new_flow_table_mask; pr->last = (seed >> 32) & vip->new_flow_table_mask; } //Let's create a new flow table vec_validate(new_flow_table, vip->new_flow_table_mask); for (i=0; ilast; pr->last = (pr->last + pr->skip) & vip->new_flow_table_mask; if (new_flow_table[last].as_index == 0) { new_flow_table[last].as_index = pr->as_index; break; } } done++; if (done == vec_len(new_flow_table)) goto finished; } } finished: vec_free(sort_arr); old_table = vip->new_flow_table; vip->new_flow_table = new_flow_table; vec_free(old_table); } int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, u32 per_cpu_sticky_buckets, u32 flow_timeout) { lb_main_t *lbm = &lb_main; if (!is_pow2(per_cpu_sticky_buckets)) return VNET_API_ERROR_INVALID_MEMORY_SIZE; lb_get_writer_lock(); //Not exactly necessary but just a reminder that it exists for my future self lbm->ip4_src_address = *ip4_address; lbm->ip6_src_address = *ip6_address; lbm->per_cpu_sticky_buckets = per_cpu_sticky_buckets; lbm->flow_timeout = flow_timeout; lb_put_writer_lock(); return 0; } static int lb_vip_port_find_index(ip46_address_t *prefix, u8 plen, u8 protocol, u16 port, lb_lkp_type_t lkp_type, u32 *vip_index) { lb_main_t *lbm = &lb_main; lb_vip_t *vip; /* This must be called with the lock owned */ CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock); ip46_prefix_normalize(prefix, plen); pool_foreach(vip, lbm->vips, { if ((vip->flags & LB_AS_FLAGS_USED) && vip->plen == plen && vip->prefix.as_u64[0] == prefix->as_u64[0] && vip->prefix.as_u64[1] == prefix->as_u64[1]) { if((lkp_type == LB_LKP_SAME_IP_PORT && vip->protocol == protocol && vip->port == port) || (lkp_type == LB_LKP_ALL_PORT_IP && vip->port == 0) || (lkp_type == LB_LKP_DIFF_IP_PORT && (vip->protocol != protocol || vip->port != port) ) ) { *vip_index = vip - lbm->vips; return 0; } } }); return VNET_API_ERROR_NO_SUCH_ENTRY; } static int lb_vip_port_find_index_with_lock(ip46_address_t *prefix, u8 plen, u8 protocol, u16 
port, u32 *vip_index) { return lb_vip_port_find_index(prefix, plen, protocol, port, LB_LKP_SAME_IP_PORT, vip_index); } static int lb_vip_port_find_all_port_vip(ip46_address_t *prefix, u8 plen, u32 *vip_index) { return lb_vip_port_find_index(prefix, plen, ~0, 0, LB_LKP_ALL_PORT_IP, vip_index); } /* Find out per-port-vip entry with different protocol and port */ static int lb_vip_port_find_diff_port(ip46_address_t *prefix, u8 plen, u8 protocol, u16 port, u32 *vip_index) { return lb_vip_port_find_index(prefix, plen, protocol, port, LB_LKP_DIFF_IP_PORT, vip_index); } int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u8 protocol, u16 port, u32 *vip_index) { int ret; lb_get_writer_lock(); ret = lb_vip_port_find_index_with_lock(prefix, plen, protocol, port, vip_index); lb_put_writer_lock(); return ret; } static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_index) { lb_main_t *lbm = &lb_main; /* This must be called with the lock owned */ CLIB_SPINLOCK_ASSERT_LOCKED (&lbm->writer_lock); lb_as_t *as; u32 *asi; pool_foreach(asi, vip->as_indexes, { as = &lbm->ass[*asi]; if (as->vip_index == (vip - lbm->vips) && as->address.as_u64[0] == address->as_u64[0] && as->address.as_u64[1] == address->as_u64[1]) { *as_index = as - lbm->ass; return 0; } }); return -1; } int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n) { lb_main_t *lbm = &lb_main; lb_get_writer_lock(); lb_vip_t *vip; if (!(vip = lb_vip_get_by_index(vip_index))) { lb_put_writer_lock(); return VNET_API_ERROR_NO_SUCH_ENTRY; } ip46_type_t type = lb_encap_is_ip4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6; u32 *to_be_added = 0; u32 *to_be_updated = 0; u32 i; u32 *ip; lb_snat_mapping_t *m; //Sanity check while (n--) { if (!lb_as_find_index_vip(vip, &addresses[n], &i)) { if (lbm->ass[i].flags & LB_AS_FLAGS_USED) { vec_free(to_be_added); vec_free(to_be_updated); lb_put_writer_lock(); return VNET_API_ERROR_VALUE_EXIST; } vec_add1(to_be_updated, i); goto next; } if 
(ip46_address_type(&addresses[n]) != type) { vec_free(to_be_added); vec_free(to_be_updated); lb_put_writer_lock(); return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; } if (n)
/*
 * Copyright (c) 2017 Cisco and/or its affiliates.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stn/stn.h>

#include <vnet/plugin/plugin.h>
#include <vpp/app/version.h>
#include <vnet/ip/format.h>
#include <vnet/ethernet/packet.h>
#include <vnet/udp/udp.h>
#include <vnet/tcp/tcp.h>

/* Singleton plugin state (rule pool, lookup tables, ...). */
stn_main_t stn_main;
/* Punt graph nodes registered elsewhere in this file; their indices are
 * used to pick the per-address-family node when resolving a rule's
 * next-node arc (see format_stn_rule). */
static vlib_node_registration_t stn_ip4_punt;
static vlib_node_registration_t stn_ip6_punt;

/* Placeholder MAC addresses used when re-framing punted packets.
 * The values are arbitrary markers, not real interface addresses. */
static u8 stn_hw_addr_local[6] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x01};
static u8 stn_hw_addr_dst[6] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x02};

/* Prebuilt ethernet headers, one per address family; presumably
 * populated at init with the MACs above — TODO confirm against the
 * plugin's init function (not visible in this chunk). */
static ethernet_header_t stn_ip4_ethernet_header = {};
static ethernet_header_t stn_ip6_ethernet_header = {};

/* Per-packet trace record for the ip4/ip6 punt nodes: the bihash
 * key/value pair from the rule lookup. value == ~0 indicates a miss
 * (see format_stn_ip46_punt_trace). */
typedef struct {
  clib_bihash_kv_16_8_t kv;
} stn_ip46_punt_trace_t;

static u8 *
format_stn_rule (u8 * s, va_list * args)
{
  /* Pretty-print one STN rule: pool index, matched address, target
   * interface, and the graph node the punt node forwards to. */
  stn_rule_t *rule = va_arg (*args, stn_rule_t *);
  stn_main_t *sm = &stn_main;
  u32 indent = format_get_indent (s);

  /* Pick the punt node matching the rule's address family, then resolve
   * the rule's next-node arc to a concrete graph node for display. */
  u32 punt_index;
  if (ip46_address_is_ip4 (&rule->address))
    punt_index = stn_ip4_punt.index;
  else
    punt_index = stn_ip6_punt.index;
  vlib_node_t *nn =
    vlib_get_next_node (vlib_get_main (), punt_index, rule->next_node_index);

  s = format (s, "rule_index: %d\n", rule - sm->rules);
  s = format (s, "%Uaddress: %U\n", format_white_space, indent,
	      format_ip46_address, &rule->address, IP46_TYPE_ANY);
  s = format (s, "%Uiface: %U (%d)\n", format_white_space, indent,
	      format_vnet_sw_if_index_name, vnet_get_main (),
	      rule->sw_if_index, rule->sw_if_index);
  s = format (s, "%Unext_node: %s (%d)", format_white_space, indent,
	      nn->name, nn->index);
  return s;
}

static_always_inline u8 *
format_stn_ip46_punt_trace (u8 * s, va_list * args, u8 is_ipv4)
{
  /* Trace formatter shared by the ip4 and ip6 punt nodes: prints the
   * looked-up destination address and, on a lookup hit, the rule. */
  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
  stn_ip46_punt_trace_t *trace = va_arg (*args, stn_ip46_punt_trace_t *);
  u32 indent = format_get_indent (s);

  s = format (s, "dst_address: %U\n", format_ip46_address,
	      (ip46_address_t *) trace->kv.key, IP46_TYPE_ANY);

  /* kv.value of ~0 means the lookup missed: no rule matched. */
  if (trace->kv.value != ~(0L))
    {
      s = format (s, "%Urule:\n%U%U", format_white_space, indent,
		  format_white_space, indent + 2,
		  format_stn_rule, &stn_main.rules[trace->kv.value]);
    }
  else
    {
      s = format (s, "%Urule: none", format_white_space, indent);
    }
  return s;
}

/* Next-node indices for the ip4/ip6 punt nodes. */
typedef enum
{
  STN_IP_PUNT_DROP,   /* drop arc for packets with no matching rule */
  STN_IP_PUNT_N_NEXT, /* number of next nodes */
} stn_ip_punt_next_t;

static_always_inline uword
stn_ip46_punt_fn (vlib_main_t * vm,
                  vlib_node_runtime_t * node,
                  vlib_frame_t * frame,
                  u8 is_ipv4)
{
  u32 n_left_from, *from, next_index, *to_next, n_left_to_next;
  stn_main_t *stn = &stn_main;

  from = vlib_frame_vector_args (frame);
  n_left_from = frame->n_vectors