diff options
author | Damjan Marion <damarion@cisco.com> | 2016-12-28 18:38:59 +0100 |
---|---|---|
committer | Damjan Marion <damarion@cisco.com> | 2017-01-01 18:11:43 +0100 |
commit | cb034b9b374927c7552e36dcbc306d8456b2a0cb (patch) | |
tree | 9ff64f9792560630c8cf8faa2f74fc20671c30f1 /src/plugins/lb | |
parent | fdc62abdc113ea63dc867375bd49ef3043dcd290 (diff) |
Move java,lua api and remaining plugins to src/
Change-Id: I1c3b87e886603678368428ae56a6bd3327cbc90d
Signed-off-by: Damjan Marion <damarion@cisco.com>
Diffstat (limited to 'src/plugins/lb')
-rw-r--r-- | src/plugins/lb/api.c | 228 | ||||
-rw-r--r-- | src/plugins/lb/cli.c | 250 | ||||
-rw-r--r-- | src/plugins/lb/lb.api | 71 | ||||
-rw-r--r-- | src/plugins/lb/lb.c | 844 | ||||
-rw-r--r-- | src/plugins/lb/lb.h | 333 | ||||
-rw-r--r-- | src/plugins/lb/lb_plugin_doc.md | 141 | ||||
-rw-r--r-- | src/plugins/lb/lb_test.c | 293 | ||||
-rw-r--r-- | src/plugins/lb/lbhash.h | 216 | ||||
-rw-r--r-- | src/plugins/lb/node.c | 419 | ||||
-rw-r--r-- | src/plugins/lb/refcount.c | 41 | ||||
-rw-r--r-- | src/plugins/lb/refcount.h | 67 | ||||
-rw-r--r-- | src/plugins/lb/util.c | 72 | ||||
-rw-r--r-- | src/plugins/lb/util.h | 40 |
13 files changed, 3015 insertions, 0 deletions
diff --git a/src/plugins/lb/api.c b/src/plugins/lb/api.c new file mode 100644 index 00000000000..06c53fa1005 --- /dev/null +++ b/src/plugins/lb/api.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/lb.h> + +#include <vppinfra/byte_order.h> +#include <vlibapi/api.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> + +#define vl_msg_id(n,h) n, +typedef enum { +#include <lb/lb.api.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + + +/* define message structures */ +#define vl_typedefs +#include <lb/lb.api.h> +#undef vl_typedefs + +/* define generated endian-swappers */ +#define vl_endianfun +#include <lb/lb.api.h> +#undef vl_endianfun + +#define vl_print(handle, ...) vlib_cli_output (handle, __VA_ARGS__) + +/* Get the API version number */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <lb/lb.api.h> +#undef vl_api_version + +#define vl_msg_name_crc_list +#include <lb/lb.api.h> +#undef vl_msg_name_crc_list + +static void +setup_message_id_table (lb_main_t * lbm, api_main_t * am) +{ +#define _(id,n,crc) \ + vl_msg_api_add_msg_name_crc (am, #n "_" #crc, id + lbm->msg_id_base); + foreach_vl_msg_name_crc_lb; +#undef _ +} + +/* Macro to finish up custom dump fns */ +#define FINISH \ + vec_add1 (s, 0); \ + vl_print (handle, (char *)s); \ + vec_free (s); \ + return handle; + +/* + * A handy macro to set up a message reply. + * Assumes that the following variables are available: + * mp - pointer to request message + * rmp - pointer to reply message type + * rv - return value + */ + +#define REPLY_MACRO(t) \ +do { \ + unix_shared_memory_queue_t * q = \ + vl_api_client_index_to_input_queue (mp->client_index); \ + if (!q) \ + return; \ + \ + rmp = vl_msg_api_alloc (sizeof (*rmp)); \ + rmp->_vl_msg_id = ntohs((t)+lbm->msg_id_base); \ + rmp->context = mp->context; \ + rmp->retval = ntohl(rv); \ + \ + vl_msg_api_send_shmem (q, (u8 *)&rmp); \ +} while(0); + +static void +vl_api_lb_conf_t_handler +(vl_api_lb_conf_t * mp) +{ + lb_main_t *lbm = &lb_main; + vl_api_lb_conf_reply_t * rmp; + int rv = 0; + + rv = lb_conf((ip4_address_t *)&mp->ip4_src_address, + (ip6_address_t *)mp->ip6_src_address, + mp->sticky_buckets_per_core, + mp->flow_timeout); + + REPLY_MACRO (VL_API_LB_CONF_REPLY); +} + +static void *vl_api_lb_conf_t_print +(vl_api_lb_conf_t *mp, void * handle) +{ + u8 * s; + s = format (0, "SCRIPT: lb_conf "); + s = format (s, "%U ", format_ip4_address, (ip4_address_t *)&mp->ip4_src_address); + s = format (s, "%U ", format_ip6_address, (ip6_address_t *)mp->ip6_src_address); + s = format (s, "%u ", mp->sticky_buckets_per_core); + s = format (s, "%u ", mp->flow_timeout); + FINISH; +} + + +static void +vl_api_lb_add_del_vip_t_handler +(vl_api_lb_add_del_vip_t * mp) +{ + lb_main_t *lbm = &lb_main; + vl_api_lb_conf_reply_t * rmp; + int rv = 0; + ip46_address_t prefix; + memcpy(&prefix.ip6, mp->ip_prefix, sizeof(prefix.ip6)); + + if (mp->is_del) { + u32 vip_index; + if (!(rv = lb_vip_find_index(&prefix, mp->prefix_length, &vip_index))) + rv = lb_vip_del(vip_index); + } else { + u32 vip_index; + lb_vip_type_t type; + if (ip46_prefix_is_ip4(&prefix, mp->prefix_length)) { + type = mp->is_gre4?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6; + } else { + type = mp->is_gre4?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6; + } + + rv = lb_vip_add(&prefix, mp->prefix_length, type, + mp->new_flows_table_length, &vip_index); + } + REPLY_MACRO (VL_API_LB_CONF_REPLY); +} + +static void *vl_api_lb_add_del_vip_t_print +(vl_api_lb_add_del_vip_t *mp, void * handle) +{ + u8 * s; + s = format (0, "SCRIPT: lb_add_del_vip "); + s = format (s, "%U ", format_ip46_prefix, + (ip46_address_t *)mp->ip_prefix, mp->prefix_length, IP46_TYPE_ANY); + s = format (s, "%s ", mp->is_gre4?"gre4":"gre6"); + s = format (s, "%u ", mp->new_flows_table_length); + s = format (s, "%s ", mp->is_del?"del":"add"); + FINISH; +} + +static void +vl_api_lb_add_del_as_t_handler +(vl_api_lb_add_del_as_t * mp) +{ + lb_main_t *lbm = &lb_main; + vl_api_lb_conf_reply_t * rmp; + int rv = 0; + u32 vip_index; + if ((rv = lb_vip_find_index((ip46_address_t *)mp->vip_ip_prefix, + mp->vip_prefix_length, &vip_index))) + goto done; + + if (mp->is_del) + rv = lb_vip_del_ass(vip_index, (ip46_address_t *)mp->as_address, 1); + else + rv = lb_vip_add_ass(vip_index, (ip46_address_t *)mp->as_address, 1); + +done: + REPLY_MACRO (VL_API_LB_CONF_REPLY); +} + +static void *vl_api_lb_add_del_as_t_print +(vl_api_lb_add_del_as_t *mp, void * handle) +{ + u8 * s; + s = format (0, "SCRIPT: lb_add_del_as "); + s = format (s, "%U ", format_ip46_prefix, + (ip46_address_t *)mp->vip_ip_prefix, mp->vip_prefix_length, IP46_TYPE_ANY); + s = format (s, "%U ", format_ip46_address, + (ip46_address_t *)mp->as_address, IP46_TYPE_ANY); + s = format (s, "%s ", mp->is_del?"del":"add"); + FINISH; +} + +/* List of message types that this plugin understands */ +#define foreach_lb_plugin_api_msg \ +_(LB_CONF, lb_conf) \ +_(LB_ADD_DEL_VIP, lb_add_del_vip) \ +_(LB_ADD_DEL_AS, lb_add_del_as) + +static clib_error_t * lb_api_init (vlib_main_t * vm) +{ + lb_main_t *lbm = &lb_main; + u8 *name = format (0, "lb_%08x%c", api_version, 0); + lbm->msg_id_base = vl_msg_api_get_msg_ids + ((char *) name, VL_MSG_FIRST_AVAILABLE); + +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + lbm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_lb_plugin_api_msg; +#undef _ + + /* Add our API messages to the global name_crc hash table */ + setup_message_id_table (lbm, &api_main); + + return 0; +} + +VLIB_INIT_FUNCTION (lb_api_init); diff --git a/src/plugins/lb/cli.c b/src/plugins/lb/cli.c new file mode 100644 index 00000000000..b59c6426241 --- /dev/null +++ b/src/plugins/lb/cli.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/lb.h> +#include <lb/util.h> + +static clib_error_t * +lb_vip_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + ip46_address_t prefix; + u8 plen; + u32 new_len = 1024; + u8 del = 0; + int ret; + u32 gre4 = 0; + lb_vip_type_t type; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + if (!unformat(line_input, "%U", unformat_ip46_prefix, &prefix, &plen, IP46_TYPE_ANY, &plen)) + return clib_error_return (0, "invalid vip prefix: '%U'", + format_unformat_error, line_input); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(line_input, "new_len %d", &new_len)) + ; + else if (unformat(line_input, "del")) + del = 1; + else if (unformat(line_input, "encap gre4")) + gre4 = 1; + else if (unformat(line_input, "encap gre6")) + gre4 = 0; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + + if (ip46_prefix_is_ip4(&prefix, plen)) { + type = (gre4)?LB_VIP_TYPE_IP4_GRE4:LB_VIP_TYPE_IP4_GRE6; + } else { + type = (gre4)?LB_VIP_TYPE_IP6_GRE4:LB_VIP_TYPE_IP6_GRE6; + } + + lb_garbage_collection(); + + u32 index; + if (!del) { + if ((ret = lb_vip_add(&prefix, plen, type, new_len, &index))) { + return clib_error_return (0, "lb_vip_add error %d", ret); + } else { + vlib_cli_output(vm, "lb_vip_add ok %d", index); + } + } else { + if ((ret = lb_vip_find_index(&prefix, plen, &index))) + return clib_error_return (0, "lb_vip_find_index error %d", ret); + else if ((ret = lb_vip_del(index))) + return clib_error_return (0, "lb_vip_del error %d", ret); + } + return NULL; +} + +VLIB_CLI_COMMAND (lb_vip_command, static) = +{ + .path = "lb vip", + .short_help = "lb vip <prefix> [encap (gre6|gre4)] [new_len <n>] [del]", + .function = lb_vip_command_fn, +}; + +static clib_error_t * +lb_as_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + ip46_address_t vip_prefix, as_addr; + u8 vip_plen; + ip46_address_t *as_array = 0; + u32 vip_index; + u8 del = 0; + int ret; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + if (!unformat(line_input, "%U", unformat_ip46_prefix, &vip_prefix, &vip_plen, IP46_TYPE_ANY)) + return clib_error_return (0, "invalid as address: '%U'", + format_unformat_error, line_input); + + if ((ret = lb_vip_find_index(&vip_prefix, vip_plen, &vip_index))) + return clib_error_return (0, "lb_vip_find_index error %d", ret); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(line_input, "%U", unformat_ip46_address, &as_addr, IP46_TYPE_ANY)) { + vec_add1(as_array, as_addr); + } else if (unformat(line_input, "del")) { + del = 1; + } else { + vec_free(as_array); + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + } + + if (!vec_len(as_array)) { + vec_free(as_array); + return clib_error_return (0, "No AS address provided"); + } + + lb_garbage_collection(); + clib_warning("vip index is %d", vip_index); + + if (del) { + if ((ret = lb_vip_del_ass(vip_index, as_array, vec_len(as_array)))) { + vec_free(as_array); + return clib_error_return (0, "lb_vip_del_ass error %d", ret); + } + } else { + if ((ret = lb_vip_add_ass(vip_index, as_array, vec_len(as_array)))) { + vec_free(as_array); + return clib_error_return (0, "lb_vip_add_ass error %d", ret); + } + } + + vec_free(as_array); + return 0; +} + +VLIB_CLI_COMMAND (lb_as_command, static) = +{ + .path = "lb as", + .short_help = "lb as <vip-prefix> [<address> [<address> [...]]] [del]", + .function = lb_as_command_fn, +}; + +static clib_error_t * +lb_conf_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + lb_main_t *lbm = &lb_main; + unformat_input_t _line_input, *line_input = &_line_input; + ip4_address_t ip4 = lbm->ip4_src_address; + ip6_address_t ip6 = lbm->ip6_src_address; + u32 per_cpu_sticky_buckets = lbm->per_cpu_sticky_buckets; + u32 per_cpu_sticky_buckets_log2 = 0; + u32 flow_timeout = lbm->flow_timeout; + int ret; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat(line_input, "ip4-src-address %U", unformat_ip4_address, &ip4)) + ; + else if (unformat(line_input, "ip6-src-address %U", unformat_ip6_address, &ip6)) + ; + else if (unformat(line_input, "buckets %d", &per_cpu_sticky_buckets)) + ; + else if (unformat(line_input, "buckets-log2 %d", &per_cpu_sticky_buckets_log2)) { + if (per_cpu_sticky_buckets_log2 >= 32) + return clib_error_return (0, "buckets-log2 value is too high"); + per_cpu_sticky_buckets = 1 << per_cpu_sticky_buckets_log2; + } else if (unformat(line_input, "timeout %d", &flow_timeout)) + ; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + lb_garbage_collection(); + + if ((ret = lb_conf(&ip4, &ip6, per_cpu_sticky_buckets, flow_timeout))) + return clib_error_return (0, "lb_conf error %d", ret); + + return NULL; +} + +VLIB_CLI_COMMAND (lb_conf_command, static) = +{ + .path = "lb conf", + .short_help = "lb conf [ip4-src-address <addr>] [ip6-src-address <addr>] [buckets <n>] [timeout <s>]", + .function = lb_conf_command_fn, +}; + +static clib_error_t * +lb_show_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vlib_cli_output(vm, "%U", format_lb_main); + return NULL; +} + + +VLIB_CLI_COMMAND (lb_show_command, static) = +{ + .path = "show lb", + .short_help = "show lb", + .function = lb_show_command_fn, +}; + +static clib_error_t * +lb_show_vips_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t line_input; + lb_main_t *lbm = &lb_main; + lb_vip_t *vip; + u8 verbose = 0; + + if (!unformat_user (input, unformat_line_input, &line_input)) + return 0; + + if (unformat(&line_input, "verbose")) + verbose = 1; + + pool_foreach(vip, lbm->vips, { + vlib_cli_output(vm, "%U\n", verbose?format_lb_vip_detailed:format_lb_vip, vip); + }); + + unformat_free (&line_input); + return NULL; +} + +VLIB_CLI_COMMAND (lb_show_vips_command, static) = +{ + .path = "show lb vips", + .short_help = "show lb vips [verbose]", + .function = lb_show_vips_command_fn, +}; diff --git a/src/plugins/lb/lb.api b/src/plugins/lb/lb.api new file mode 100644 index 00000000000..39ee3c8f98b --- /dev/null +++ b/src/plugins/lb/lb.api @@ -0,0 +1,71 @@ +/** \brief Configure Load-Balancer global parameters + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param ip4_src_address - IPv4 address to be used as source for IPv4 GRE traffic. + @param ip6_src_address - IPv6 address to be used as source for IPv6 GRE traffic. + @param n_sticky_buckets - Number of buckets *per worker thread* in the + established flow table (must be power of 2). + @param flow_timeout - Time in seconds after which, if no packet is received + for a given flow, the flow is removed from the established flow table. +*/ +define lb_conf +{ + u32 client_index; + u32 context; + u32 ip4_src_address; + u8 ip6_src_address[16]; + u32 sticky_buckets_per_core; + u32 flow_timeout; +}; + +define lb_conf_reply { + u32 context; + i32 retval; +}; + +/** \brief Add a virtual address (or prefix) + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param ip_prefix - IP address (IPv4 in lower order 32 bits). + @param prefix_length - IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param is_gre4 - Encap is ip4 GRE (ip6 GRE otherwise). + @param new_flows_table_length - Size of the new connections flow table used + for this VIP (must be power of 2). + @param is_del - The VIP should be removed. +*/ +define lb_add_del_vip { + u32 client_index; + u32 context; + u8 ip_prefix[16]; + u8 prefix_length; + u8 is_gre4; + u32 new_flows_table_length; + u8 is_del; +}; + +define lb_add_del_vip_reply { + u32 context; + i32 retval; +}; + +/** \brief Add an application server for a given VIP + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param vip_ip_prefix - VIP IP address (IPv4 in lower order 32 bits). + @param vip_ip_prefix - VIP IP prefix length (96 + 'IPv4 prefix length' for IPv4). + @param as_address - The application server address (IPv4 in lower order 32 bits). + @param is_del - The AS should be removed. +*/ +define lb_add_del_as { + u32 client_index; + u32 context; + u8 vip_ip_prefix[16]; + u8 vip_prefix_length; + u8 as_address[16]; + u8 is_del; +}; + +define lb_add_del_as_reply { + u32 context; + i32 retval; +}; diff --git a/src/plugins/lb/lb.c b/src/plugins/lb/lb.c new file mode 100644 index 00000000000..1d9b987095b --- /dev/null +++ b/src/plugins/lb/lb.c @@ -0,0 +1,844 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/lb.h> +#include <vnet/plugin/plugin.h> +#include <vnet/api_errno.h> + +//GC runs at most once every so many seconds +#define LB_GARBAGE_RUN 60 + +//After so many seconds. It is assumed that inter-core race condition will not occur. +#define LB_CONCURRENCY_TIMEOUT 10 + +lb_main_t lb_main; + +#define lb_get_writer_lock() do {} while(__sync_lock_test_and_set (lb_main.writer_lock, 1)) +#define lb_put_writer_lock() lb_main.writer_lock[0] = 0 + +static void lb_as_stack (lb_as_t *as); + + +const static char * const lb_dpo_gre4_ip4[] = { "lb4-gre4" , NULL }; +const static char * const lb_dpo_gre4_ip6[] = { "lb6-gre4" , NULL }; +const static char* const * const lb_dpo_gre4_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre4_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre4_ip6, + }; + +const static char * const lb_dpo_gre6_ip4[] = { "lb4-gre6" , NULL }; +const static char * const lb_dpo_gre6_ip6[] = { "lb6-gre6" , NULL }; +const static char* const * const lb_dpo_gre6_nodes[DPO_PROTO_NUM] = + { + [DPO_PROTO_IP4] = lb_dpo_gre6_ip4, + [DPO_PROTO_IP6] = lb_dpo_gre6_ip6, + }; + +u32 lb_hash_time_now(vlib_main_t * vm) +{ + return (u32) (vlib_time_now(vm) + 10000); +} + +u8 *format_lb_main (u8 * s, va_list * args) +{ + vlib_thread_main_t *tm = vlib_get_thread_main(); + lb_main_t *lbm = &lb_main; + s = format(s, "lb_main"); + s = format(s, " ip4-src-address: %U \n", format_ip4_address, &lbm->ip4_src_address); + s = format(s, " ip6-src-address: %U \n", format_ip6_address, &lbm->ip6_src_address); + s = format(s, " #vips: %u\n", pool_elts(lbm->vips)); + s = format(s, " #ass: %u\n", pool_elts(lbm->ass) - 1); + + u32 cpu_index; + for(cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++ ) { + lb_hash_t *h = lbm->per_cpu[cpu_index].sticky_ht; + if (h) { + s = format(s, "core %d\n", cpu_index); + s = format(s, " timeout: %ds\n", h->timeout); + s = format(s, " usage: %d / %d\n", lb_hash_elts(h, lb_hash_time_now(vlib_get_main())), lb_hash_size(h)); + } + } + + return s; +} + +static char *lb_vip_type_strings[] = { + [LB_VIP_TYPE_IP6_GRE6] = "ip6-gre6", + [LB_VIP_TYPE_IP6_GRE4] = "ip6-gre4", + [LB_VIP_TYPE_IP4_GRE6] = "ip4-gre6", + [LB_VIP_TYPE_IP4_GRE4] = "ip4-gre4", +}; + +u8 *format_lb_vip_type (u8 * s, va_list * args) +{ + lb_vip_type_t vipt = va_arg (*args, lb_vip_type_t); + u32 i; + for (i=0; i<LB_VIP_N_TYPES; i++) + if (vipt == i) + return format(s, lb_vip_type_strings[i]); + return format(s, "_WRONG_TYPE_"); +} + +uword unformat_lb_vip_type (unformat_input_t * input, va_list * args) +{ + lb_vip_type_t *vipt = va_arg (*args, lb_vip_type_t *); + u32 i; + for (i=0; i<LB_VIP_N_TYPES; i++) + if (unformat(input, lb_vip_type_strings[i])) { + *vipt = i; + return 1; + } + return 0; +} + +u8 *format_lb_vip (u8 * s, va_list * args) +{ + lb_vip_t *vip = va_arg (*args, lb_vip_t *); + return format(s, "%U %U new_size:%u #as:%u%s", + format_lb_vip_type, vip->type, + format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY, + vip->new_flow_table_mask + 1, + pool_elts(vip->as_indexes), + (vip->flags & LB_VIP_FLAGS_USED)?"":" removed"); +} + +u8 *format_lb_as (u8 * s, va_list * args) +{ + lb_as_t *as = va_arg (*args, lb_as_t *); + return format(s, "%U %s", format_ip46_address, + &as->address, IP46_TYPE_ANY, + (as->flags & LB_AS_FLAGS_USED)?"used":"removed"); +} + +u8 *format_lb_vip_detailed (u8 * s, va_list * args) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = va_arg (*args, lb_vip_t *); + uword indent = format_get_indent (s); + + s = format(s, "%U %U [%u] %U%s\n" + "%U new_size:%u\n", + format_white_space, indent, + format_lb_vip_type, vip->type, + vip - lbm->vips, format_ip46_prefix, &vip->prefix, vip->plen, IP46_TYPE_ANY, + (vip->flags & LB_VIP_FLAGS_USED)?"":" removed", + format_white_space, indent, + vip->new_flow_table_mask + 1); + + //Print counters + s = format(s, "%U counters:\n", + format_white_space, indent); + u32 i; + for (i=0; i<LB_N_VIP_COUNTERS; i++) + s = format(s, "%U %s: %d\n", + format_white_space, indent, + lbm->vip_counters[i].name, + vlib_get_simple_counter(&lbm->vip_counters[i], vip - lbm->vips)); + + + s = format(s, "%U #as:%u\n", + format_white_space, indent, + pool_elts(vip->as_indexes)); + + //Let's count the buckets for each AS + u32 *count = 0; + vec_validate(count, pool_len(lbm->ass)); //Possibly big alloc for not much... + lb_new_flow_entry_t *nfe; + vec_foreach(nfe, vip->new_flow_table) + count[nfe->as_index]++; + + lb_as_t *as; + u32 *as_index; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + s = format(s, "%U %U %d buckets %d flows dpo:%u %s\n", + format_white_space, indent, + format_ip46_address, &as->address, IP46_TYPE_ANY, + count[as - lbm->ass], + vlib_refcount_get(&lbm->as_refcount, as - lbm->ass), + as->dpo.dpoi_index, + (as->flags & LB_AS_FLAGS_USED)?"used":" removed"); + }); + + vec_free(count); + + /* + s = format(s, "%U new flows table:\n", format_white_space, indent); + lb_new_flow_entry_t *nfe; + vec_foreach(nfe, vip->new_flow_table) { + s = format(s, "%U %d: %d\n", format_white_space, indent, nfe - vip->new_flow_table, nfe->as_index); + } + */ + return s; +} + +typedef struct { + u32 as_index; + u32 last; + u32 skip; +} lb_pseudorand_t; + +static int lb_pseudorand_compare(void *a, void *b) +{ + lb_as_t *asa, *asb; + lb_main_t *lbm = &lb_main; + asa = &lbm->ass[((lb_pseudorand_t *)a)->as_index]; + asb = &lbm->ass[((lb_pseudorand_t *)b)->as_index]; + return memcmp(&asa->address, &asb->address, sizeof(asb->address)); +} + +static void lb_vip_garbage_collection(lb_vip_t *vip) +{ + lb_main_t *lbm = &lb_main; + ASSERT (lbm->writer_lock[0]); + + u32 now = (u32) vlib_time_now(vlib_get_main()); + if (!clib_u32_loop_gt(now, vip->last_garbage_collection + LB_GARBAGE_RUN)) + return; + + vip->last_garbage_collection = now; + lb_as_t *as; + u32 *as_index; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + if (!(as->flags & LB_AS_FLAGS_USED) && //Not used + clib_u32_loop_gt(now, as->last_used + LB_CONCURRENCY_TIMEOUT) && //Not recently used + (vlib_refcount_get(&lbm->as_refcount, as - lbm->ass) == 0)) + { //Not referenced + fib_entry_child_remove(as->next_hop_fib_entry_index, + as->next_hop_child_index); + fib_table_entry_delete_index(as->next_hop_fib_entry_index, + FIB_SOURCE_RR); + as->next_hop_fib_entry_index = FIB_NODE_INDEX_INVALID; + + pool_put(vip->as_indexes, as_index); + pool_put(lbm->ass, as); + } + }); +} + +void lb_garbage_collection() +{ + lb_main_t *lbm = &lb_main; + lb_get_writer_lock(); + lb_vip_t *vip; + u32 *to_be_removed_vips = 0, *i; + pool_foreach(vip, lbm->vips, { + lb_vip_garbage_collection(vip); + + if (!(vip->flags & LB_VIP_FLAGS_USED) && + (pool_elts(vip->as_indexes) == 0)) { + vec_add1(to_be_removed_vips, vip - lbm->vips); + } + }); + + vec_foreach(i, to_be_removed_vips) { + vip = &lbm->vips[*i]; + pool_put(lbm->vips, vip); + pool_free(vip->as_indexes); + } + + vec_free(to_be_removed_vips); + lb_put_writer_lock(); +} + +static void lb_vip_update_new_flow_table(lb_vip_t *vip) +{ + lb_main_t *lbm = &lb_main; + lb_new_flow_entry_t *old_table; + u32 i, *as_index; + lb_new_flow_entry_t *new_flow_table = 0; + lb_as_t *as; + lb_pseudorand_t *pr, *sort_arr = 0; + u32 count; + + ASSERT (lbm->writer_lock[0]); //We must have the lock + + //Check if some AS is configured or not + i = 0; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + if (as->flags & LB_AS_FLAGS_USED) { //Not used anymore + i = 1; + goto out; //Not sure 'break' works in this macro-loop + } + }); + +out: + if (i == 0) { + //Only the default. i.e. no AS + vec_validate(new_flow_table, vip->new_flow_table_mask); + for (i=0; i<vec_len(new_flow_table); i++) + new_flow_table[i].as_index = 0; + + goto finished; + } + + //First, let's sort the ASs + sort_arr = 0; + vec_alloc(sort_arr, pool_elts(vip->as_indexes)); + + i = 0; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + if (!(as->flags & LB_AS_FLAGS_USED)) //Not used anymore + continue; + + sort_arr[i].as_index = as - lbm->ass; + i++; + }); + _vec_len(sort_arr) = i; + + vec_sort_with_function(sort_arr, lb_pseudorand_compare); + + //Now let's pseudo-randomly generate permutations + vec_foreach(pr, sort_arr) { + lb_as_t *as = &lbm->ass[pr->as_index]; + + u64 seed = clib_xxhash(as->address.as_u64[0] ^ + as->address.as_u64[1]); + /* We have 2^n buckets. + * skip must be prime with 2^n. + * So skip must be odd. + * MagLev actually state that M should be prime, + * but this has a big computation cost (% operation). + * Using 2^n is more better (& operation). + */ + pr->skip = ((seed & 0xffffffff) | 1) & vip->new_flow_table_mask; + pr->last = (seed >> 32) & vip->new_flow_table_mask; + } + + //Let's create a new flow table + vec_validate(new_flow_table, vip->new_flow_table_mask); + for (i=0; i<vec_len(new_flow_table); i++) + new_flow_table[i].as_index = ~0; + + u32 done = 0; + while (1) { + vec_foreach(pr, sort_arr) { + while (1) { + u32 last = pr->last; + pr->last = (pr->last + pr->skip) & vip->new_flow_table_mask; + if (new_flow_table[last].as_index == ~0) { + new_flow_table[last].as_index = pr->as_index; + break; + } + } + done++; + if (done == vec_len(new_flow_table)) + goto finished; + } + } + + vec_free(sort_arr); + +finished: + +//Count number of changed entries + count = 0; + for (i=0; i<vec_len(new_flow_table); i++) + if (vip->new_flow_table == 0 || + new_flow_table[i].as_index != vip->new_flow_table[i].as_index) + count++; + + old_table = vip->new_flow_table; + vip->new_flow_table = new_flow_table; + vec_free(old_table); +} + +int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, + u32 per_cpu_sticky_buckets, u32 flow_timeout) +{ + lb_main_t *lbm = &lb_main; + + if (!is_pow2(per_cpu_sticky_buckets)) + return VNET_API_ERROR_INVALID_MEMORY_SIZE; + + lb_get_writer_lock(); //Not exactly necessary but just a reminder that it exists for my future self + lbm->ip4_src_address = *ip4_address; + lbm->ip6_src_address = *ip6_address; + lbm->per_cpu_sticky_buckets = per_cpu_sticky_buckets; + lbm->flow_timeout = flow_timeout; + lb_put_writer_lock(); + return 0; +} + +static +int lb_vip_find_index_with_lock(ip46_address_t *prefix, u8 plen, u32 *vip_index) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip; + ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned + ip46_prefix_normalize(prefix, plen); + pool_foreach(vip, lbm->vips, { + if ((vip->flags & LB_AS_FLAGS_USED) && + vip->plen == plen && + vip->prefix.as_u64[0] == prefix->as_u64[0] && + vip->prefix.as_u64[1] == prefix->as_u64[1]) { + *vip_index = vip - lbm->vips; + return 0; + } + }); + return VNET_API_ERROR_NO_SUCH_ENTRY; +} + +int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index) +{ + int ret; + lb_get_writer_lock(); + ret = lb_vip_find_index_with_lock(prefix, plen, vip_index); + lb_put_writer_lock(); + return ret; +} + +static int lb_as_find_index_vip(lb_vip_t *vip, ip46_address_t *address, u32 *as_index) +{ + lb_main_t *lbm = &lb_main; + ASSERT (lbm->writer_lock[0]); //This must be called with the lock owned + lb_as_t *as; + u32 *asi; + pool_foreach(asi, vip->as_indexes, { + as = &lbm->ass[*asi]; + if (as->vip_index == (vip - lbm->vips) && + as->address.as_u64[0] == address->as_u64[0] && + as->address.as_u64[1] == address->as_u64[1]) { + *as_index = as - lbm->ass; + return 0; + } + }); + return -1; +} + +int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n) +{ + lb_main_t *lbm = &lb_main; + lb_get_writer_lock(); + lb_vip_t *vip; + if (!(vip = lb_vip_get_by_index(vip_index))) { + lb_put_writer_lock(); + return VNET_API_ERROR_NO_SUCH_ENTRY; + } + + ip46_type_t type = lb_vip_is_gre4(vip)?IP46_TYPE_IP4:IP46_TYPE_IP6; + u32 *to_be_added = 0; + u32 *to_be_updated = 0; + u32 i; + u32 *ip; + + //Sanity check + while (n--) { + + if (!lb_as_find_index_vip(vip, &addresses[n], &i)) { + if (lbm->ass[i].flags & LB_AS_FLAGS_USED) { + vec_free(to_be_added); + vec_free(to_be_updated); + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + vec_add1(to_be_updated, i); + goto next; + } + + if (ip46_address_type(&addresses[n]) != type) { + vec_free(to_be_added); + vec_free(to_be_updated); + lb_put_writer_lock(); + return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + } + + if (n) { + u32 n2 = n; + while(n2--) //Check for duplicates + if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] && + addresses[n2].as_u64[1] == addresses[n].as_u64[1]) + goto next; + } + + vec_add1(to_be_added, n); + +next: + continue; + } + + //Update reused ASs + vec_foreach(ip, to_be_updated) { + lbm->ass[*ip].flags = LB_AS_FLAGS_USED; + } + vec_free(to_be_updated); + + //Create those who have to be created + vec_foreach(ip, to_be_added) { + lb_as_t *as; + u32 *as_index; + pool_get(lbm->ass, as); + as->address = addresses[*ip]; + as->flags = LB_AS_FLAGS_USED; + as->vip_index = vip_index; + pool_get(vip->as_indexes, as_index); + *as_index = as - lbm->ass; + + /* + * become a child of the FIB entry + * so we are informed when its forwarding changes + */ + fib_prefix_t nh = {}; + if (lb_vip_is_gre4(vip)) { + nh.fp_addr.ip4 = as->address.ip4; + nh.fp_len = 32; + nh.fp_proto = FIB_PROTOCOL_IP4; + } else { + nh.fp_addr.ip6 = as->address.ip6; + nh.fp_len = 128; + nh.fp_proto = FIB_PROTOCOL_IP6; + } + + as->next_hop_fib_entry_index = + fib_table_entry_special_add(0, + &nh, + FIB_SOURCE_RR, + FIB_ENTRY_FLAG_NONE, + ADJ_INDEX_INVALID); + as->next_hop_child_index = + fib_entry_child_add(as->next_hop_fib_entry_index, + lbm->fib_node_type, + as - lbm->ass); + + lb_as_stack(as); + } + vec_free(to_be_added); + + //Recompute flows + lb_vip_update_new_flow_table(vip); + + //Garbage collection maybe + lb_vip_garbage_collection(vip); + + lb_put_writer_lock(); + return 0; +} + +int lb_vip_del_ass_withlock(u32 vip_index, ip46_address_t *addresses, u32 n) +{ + lb_main_t *lbm = &lb_main; + u32 now = (u32) vlib_time_now(vlib_get_main()); + u32 *ip = 0; + + lb_vip_t *vip; + if (!(vip = lb_vip_get_by_index(vip_index))) { + return VNET_API_ERROR_NO_SUCH_ENTRY; + } + + u32 *indexes = NULL; + while (n--) { + u32 i; + if (lb_as_find_index_vip(vip, &addresses[n], &i)) { + vec_free(indexes); + return VNET_API_ERROR_NO_SUCH_ENTRY; + } + + if (n) { //Check for duplicates + u32 n2 = n - 1; + while(n2--) { + if (addresses[n2].as_u64[0] == addresses[n].as_u64[0] && + addresses[n2].as_u64[1] == addresses[n].as_u64[1]) + goto next; + } + } + + vec_add1(indexes, i); +next: + continue; + } + + //Garbage collection maybe + lb_vip_garbage_collection(vip); + + if (indexes != NULL) { + vec_foreach(ip, indexes) { + lbm->ass[*ip].flags &= ~LB_AS_FLAGS_USED; + lbm->ass[*ip].last_used = now; + } + + //Recompute flows + lb_vip_update_new_flow_table(vip); + } + + vec_free(indexes); + return 0; +} + +int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n) +{ + lb_get_writer_lock(); + int ret = lb_vip_del_ass_withlock(vip_index, addresses, n); + lb_put_writer_lock(); + return ret; +} + +/** + * Add the VIP adjacency to the ip4 or ip6 fib + */ +static void lb_vip_add_adjacency(lb_main_t *lbm, lb_vip_t *vip) +{ + dpo_proto_t proto = 0; + dpo_id_t dpo = DPO_INVALID; + fib_prefix_t pfx = {}; + if (lb_vip_is_ip4(vip)) { + pfx.fp_addr.ip4 = vip->prefix.ip4; + pfx.fp_len = vip->plen - 96; + pfx.fp_proto = FIB_PROTOCOL_IP4; + proto = DPO_PROTO_IP4; + } else { + pfx.fp_addr.ip6 = vip->prefix.ip6; + pfx.fp_len = vip->plen; + pfx.fp_proto = FIB_PROTOCOL_IP6; + proto = DPO_PROTO_IP6; + } + dpo_set(&dpo, lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, + proto, vip - lbm->vips); + fib_table_entry_special_dpo_add(0, + &pfx, + FIB_SOURCE_PLUGIN_HI, + FIB_ENTRY_FLAG_EXCLUSIVE, + &dpo); + dpo_reset(&dpo); +} + +/** + * Deletes the adjacency associated with the VIP + */ +static void lb_vip_del_adjacency(lb_main_t *lbm, lb_vip_t *vip) +{ + fib_prefix_t pfx = {}; + if (lb_vip_is_ip4(vip)) { + pfx.fp_addr.ip4 = vip->prefix.ip4; + pfx.fp_len = vip->plen - 96; + pfx.fp_proto = FIB_PROTOCOL_IP4; + } else { + pfx.fp_addr.ip6 = vip->prefix.ip6; + pfx.fp_len = vip->plen; + pfx.fp_proto = FIB_PROTOCOL_IP6; + } + fib_table_entry_special_remove(0, &pfx, FIB_SOURCE_PLUGIN_HI); +} + +int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, u32 new_length, u32 *vip_index) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip; + lb_get_writer_lock(); + ip46_prefix_normalize(prefix, plen); + + if (!lb_vip_find_index_with_lock(prefix, plen, vip_index)) { + lb_put_writer_lock(); + return VNET_API_ERROR_VALUE_EXIST; + } + + if (!is_pow2(new_length)) { + lb_put_writer_lock(); + return VNET_API_ERROR_INVALID_MEMORY_SIZE; + } + + if (ip46_prefix_is_ip4(prefix, plen) && + (type != LB_VIP_TYPE_IP4_GRE4) && + (type != LB_VIP_TYPE_IP4_GRE6)) + return VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + + + //Allocate + pool_get(lbm->vips, vip); + + //Init + vip->prefix = *prefix; + vip->plen = plen; + vip->last_garbage_collection = (u32) vlib_time_now(vlib_get_main()); + vip->type = type; + vip->flags = LB_VIP_FLAGS_USED; + vip->as_indexes = 0; + + //Validate counters + u32 i; + for (i = 0; i < LB_N_VIP_COUNTERS; i++) { + vlib_validate_simple_counter(&lbm->vip_counters[i], vip - lbm->vips); + vlib_zero_simple_counter(&lbm->vip_counters[i], vip - lbm->vips); + } + + //Configure new flow table + vip->new_flow_table_mask = new_length - 1; + vip->new_flow_table = 0; + + //Create a new flow hash table full of the default entry + lb_vip_update_new_flow_table(vip); + + //Create adjacency to direct traffic + lb_vip_add_adjacency(lbm, vip); + + //Return result + *vip_index = vip - lbm->vips; + + lb_put_writer_lock(); + return 0; +} + +int lb_vip_del(u32 vip_index) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip; + lb_get_writer_lock(); + if (!(vip = lb_vip_get_by_index(vip_index))) { + lb_put_writer_lock(); + return VNET_API_ERROR_NO_SUCH_ENTRY; + } + + //FIXME: This operation is actually not working + //We will need to remove state before performing this. + + { + //Remove all ASs + ip46_address_t *ass = 0; + lb_as_t *as; + u32 *as_index; + pool_foreach(as_index, vip->as_indexes, { + as = &lbm->ass[*as_index]; + vec_add1(ass, as->address); + }); + if (vec_len(ass)) + lb_vip_del_ass_withlock(vip_index, ass, vec_len(ass)); + vec_free(ass); + } + + //Delete adjacency + lb_vip_del_adjacency(lbm, vip); + + //Set the VIP as unused + vip->flags &= ~LB_VIP_FLAGS_USED; + + lb_put_writer_lock(); + return 0; +} + +clib_error_t * +vlib_plugin_register (vlib_main_t * vm, + vnet_plugin_handoff_t * h, + int from_early_init) +{ + clib_error_t *error = 0; + return error; +} + + +u8 *format_lb_dpo (u8 * s, va_list * va) +{ + index_t index = va_arg (*va, index_t); + CLIB_UNUSED(u32 indent) = va_arg (*va, u32); + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = pool_elt_at_index (lbm->vips, index); + return format (s, "%U", format_lb_vip, vip); +} + +static void lb_dpo_lock (dpo_id_t *dpo) {} +static void lb_dpo_unlock (dpo_id_t *dpo) {} + +static fib_node_t * +lb_fib_node_get_node (fib_node_index_t index) +{ + lb_main_t *lbm = &lb_main; + lb_as_t *as = pool_elt_at_index (lbm->ass, index); + return (&as->fib_node); +} + +static void +lb_fib_node_last_lock_gone (fib_node_t *node) +{ +} + +static lb_as_t * +lb_as_from_fib_node (fib_node_t *node) +{ + return ((lb_as_t*)(((char*)node) - + STRUCT_OFFSET_OF(lb_as_t, fib_node))); +} + +static void +lb_as_stack (lb_as_t *as) +{ + lb_main_t *lbm = &lb_main; + lb_vip_t *vip = &lbm->vips[as->vip_index]; + dpo_stack(lb_vip_is_gre4(vip)?lbm->dpo_gre4_type:lbm->dpo_gre6_type, + lb_vip_is_ip4(vip)?DPO_PROTO_IP4:DPO_PROTO_IP6, + &as->dpo, + fib_entry_contribute_ip_forwarding( + as->next_hop_fib_entry_index)); +} + +static fib_node_back_walk_rc_t +lb_fib_node_back_walk_notify (fib_node_t *node, + fib_node_back_walk_ctx_t *ctx) +{ + lb_as_stack(lb_as_from_fib_node(node)); + return (FIB_NODE_BACK_WALK_CONTINUE); +} + +clib_error_t * +lb_init (vlib_main_t * vm) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + lb_main_t *lbm = &lb_main; + lb_as_t *default_as; + fib_node_vft_t lb_fib_node_vft = { + .fnv_get = lb_fib_node_get_node, + .fnv_last_lock = lb_fib_node_last_lock_gone, + .fnv_back_walk = lb_fib_node_back_walk_notify, + }; + dpo_vft_t lb_vft = { + .dv_lock = lb_dpo_lock, + .dv_unlock = lb_dpo_unlock, + .dv_format = format_lb_dpo, + }; + + lbm->vips = 0; + lbm->per_cpu = 0; + vec_validate(lbm->per_cpu, tm->n_vlib_mains - 1); + lbm->writer_lock = clib_mem_alloc_aligned (CLIB_CACHE_LINE_BYTES, CLIB_CACHE_LINE_BYTES); + lbm->writer_lock[0] = 0; + lbm->per_cpu_sticky_buckets = LB_DEFAULT_PER_CPU_STICKY_BUCKETS; + lbm->flow_timeout = LB_DEFAULT_FLOW_TIMEOUT; + lbm->ip4_src_address.as_u32 = 0xffffffff; + lbm->ip6_src_address.as_u64[0] = 0xffffffffffffffffL; + lbm->ip6_src_address.as_u64[1] = 0xffffffffffffffffL; + lbm->dpo_gre4_type = dpo_register_new_type(&lb_vft, lb_dpo_gre4_nodes); + lbm->dpo_gre6_type = dpo_register_new_type(&lb_vft, lb_dpo_gre6_nodes); + lbm->fib_node_type = fib_node_register_new_type(&lb_fib_node_vft); + + //Init AS reference counters + vlib_refcount_init(&lbm->as_refcount); + + //Allocate and init default AS. + lbm->ass = 0; + pool_get(lbm->ass, default_as); + default_as->flags = 0; + default_as->dpo.dpoi_next_node = LB_NEXT_DROP; + default_as->vip_index = ~0; + default_as->address.ip6.as_u64[0] = 0xffffffffffffffffL; + default_as->address.ip6.as_u64[1] = 0xffffffffffffffffL; + +#define _(a,b,c) lbm->vip_counters[c].name = b; + lb_foreach_vip_counter +#undef _ + return NULL; +} + +VLIB_INIT_FUNCTION (lb_init); diff --git a/src/plugins/lb/lb.h b/src/plugins/lb/lb.h new file mode 100644 index 00000000000..882b9b30f7e --- /dev/null +++ b/src/plugins/lb/lb.h @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * lb-plugin implements a MagLev-like load balancer. + * http://research.google.com/pubs/pub44824.html + * + * It hasn't been tested for interoperability with the original MagLev + * but intends to provide similar functionality. + * The load-balancer receives traffic destined to VIP (Virtual IP) + * addresses from one or multiple(ECMP) routers. + * The load-balancer tunnels the traffic toward many application servers + * ensuring session stickyness (i.e. that a single sessions is tunneled + * towards a single application server). + * + */ + +#ifndef LB_PLUGIN_LB_LB_H_ +#define LB_PLUGIN_LB_LB_H_ + +#include <lb/util.h> +#include <lb/refcount.h> + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/dpo/dpo.h> +#include <vnet/fib/fib_table.h> + +#include <lb/lbhash.h> + +#define LB_DEFAULT_PER_CPU_STICKY_BUCKETS 1 << 10 +#define LB_DEFAULT_FLOW_TIMEOUT 40 + +typedef enum { + LB_NEXT_DROP, + LB_N_NEXT, +} lb_next_t; + +/** + * Each VIP is configured with a set of + * application server. + */ +typedef struct { + /** + * Registration to FIB event. + */ + fib_node_t fib_node; + + /** + * Destination address used to tunnel traffic towards + * that application server. + * The address is also used as ID and pseudo-random + * seed for the load-balancing process. + */ + ip46_address_t address; + + /** + * ASs are indexed by address and VIP Index. + * Which means there will be duplicated if the same server + * address is used for multiple VIPs. + */ + u32 vip_index; + + /** + * Some per-AS flags. + * For now only LB_AS_FLAGS_USED is defined. + */ + u8 flags; + +#define LB_AS_FLAGS_USED 0x1 + + /** + * Rotating timestamp of when LB_AS_FLAGS_USED flag was last set. + * + * AS removal is based on garbage collection and reference counting. + * When an AS is removed, there is a race between configuration core + * and worker cores which may still add a reference while it should not + * be used. This timestamp is used to not remove the AS while a race condition + * may happen. + */ + u32 last_used; + + /** + * The FIB entry index for the next-hop + */ + fib_node_index_t next_hop_fib_entry_index; + + /** + * The child index on the FIB entry + */ + u32 next_hop_child_index; + + /** + * The next DPO in the graph to follow. + */ + dpo_id_t dpo; + +} lb_as_t; + +format_function_t format_lb_as; + +typedef struct { + u32 as_index; +} lb_new_flow_entry_t; + +#define lb_foreach_vip_counter \ + _(NEXT_PACKET, "packet from existing sessions", 0) \ + _(FIRST_PACKET, "first session packet", 1) \ + _(UNTRACKED_PACKET, "untracked packet", 2) \ + _(NO_SERVER, "no server configured", 3) + +typedef enum { +#define _(a,b,c) LB_VIP_COUNTER_##a = c, + lb_foreach_vip_counter +#undef _ + LB_N_VIP_COUNTERS +} lb_vip_counter_t; + +/** + * The load balancer supports IPv4 and IPv6 traffic + * and GRE4 and GRE6 encap. + */ +typedef enum { + LB_VIP_TYPE_IP6_GRE6, + LB_VIP_TYPE_IP6_GRE4, + LB_VIP_TYPE_IP4_GRE6, + LB_VIP_TYPE_IP4_GRE4, + LB_VIP_N_TYPES, +} lb_vip_type_t; + +format_function_t format_lb_vip_type; +unformat_function_t unformat_lb_vip_type; + +/** + * Load balancing service is provided per VIP. + * In this data model, a VIP can be a whole prefix. + * But load balancing only + * occurs on a per-source-address/port basis. Meaning that if a given source + * reuses the same port for multiple destinations within the same VIP, + * they will be considered as a single flow. + */ +typedef struct { + + //Runtime + + /** + * Vector mapping (flow-hash & new_connect_table_mask) to AS index. + * This is used for new flows. + */ + lb_new_flow_entry_t *new_flow_table; + + /** + * New flows table length - 1 + * (length MUST be a power of 2) + */ + u32 new_flow_table_mask; + + /** + * Last time garbage collection was run to free the ASs. + */ + u32 last_garbage_collection; + + //Not runtime + + /** + * A Virtual IP represents a given service delivered + * by a set of application servers. It can be a single + * address or a prefix. + * IPv4 prefixes are encoded using IPv4-in-IPv6 embedded address + * (i.e. ::/96 prefix). + */ + ip46_address_t prefix; + + /** + * The VIP prefix length. + * In case of IPv4, plen = 96 + ip4_plen. + */ + u8 plen; + + /** + * The type of traffic for this. + * LB_TYPE_UNDEFINED if unknown. + */ + lb_vip_type_t type; + + /** + * Flags related to this VIP. + * LB_VIP_FLAGS_USED means the VIP is active. + * When it is not set, the VIP in the process of being removed. + * We cannot immediately remove a VIP because the VIP index still may be stored + * in the adjacency index. + */ + u8 flags; +#define LB_VIP_FLAGS_USED 0x1 + + /** + * Pool of AS indexes used for this VIP. + * This also includes ASs that have been removed (but are still referenced). + */ + u32 *as_indexes; +} lb_vip_t; + +#define lb_vip_is_ip4(vip) ((vip)->type == LB_VIP_TYPE_IP4_GRE6 || (vip)->type == LB_VIP_TYPE_IP4_GRE4) +#define lb_vip_is_gre4(vip) ((vip)->type == LB_VIP_TYPE_IP6_GRE4 || (vip)->type == LB_VIP_TYPE_IP4_GRE4) +format_function_t format_lb_vip; +format_function_t format_lb_vip_detailed; + +typedef struct { + /** + * Each CPU has its own sticky flow hash table. + * One single table is used for all VIPs. + */ + lb_hash_t *sticky_ht; +} lb_per_cpu_t; + +typedef struct { + /** + * Pool of all Virtual IPs + */ + lb_vip_t *vips; + + /** + * Pool of ASs. + * ASs are referenced by address and vip index. + * The first element (index 0) is special and used only to fill + * new_flow_tables when no AS has been configured. + */ + lb_as_t *ass; + + /** + * Each AS has an associated reference counter. + * As ass[0] has a special meaning, its associated counter + * starts at 0 and is decremented instead. i.e. do not use it. + */ + vlib_refcount_t as_refcount; + + /** + * Some global data is per-cpu + */ + lb_per_cpu_t *per_cpu; + + /** + * Node next index for IP adjacencies, for each of the traffic types. + */ + u32 ip_lookup_next_index[LB_VIP_N_TYPES]; + + /** + * Source address used in IPv6 encapsulated traffic + */ + ip6_address_t ip6_src_address; + + /** + * Source address used for IPv4 encapsulated traffic + */ + ip4_address_t ip4_src_address; + + /** + * Number of buckets in the per-cpu sticky hash table. + */ + u32 per_cpu_sticky_buckets; + + /** + * Flow timeout in seconds. + */ + u32 flow_timeout; + + /** + * Per VIP counter + */ + vlib_simple_counter_main_t vip_counters[LB_N_VIP_COUNTERS]; + + /** + * DPO used to send packet from IP4/6 lookup to LB node. + */ + dpo_type_t dpo_gre4_type; + dpo_type_t dpo_gre6_type; + + /** + * Node type for registering to fib changes. + */ + fib_node_type_t fib_node_type; + + /** + * API dynamically registered base ID. + */ + u16 msg_id_base; + + volatile u32 *writer_lock; +} lb_main_t; + +extern lb_main_t lb_main; +extern vlib_node_registration_t lb6_node; +extern vlib_node_registration_t lb4_node; + +/** + * Fix global load-balancer parameters. + * @param ip4_address IPv4 source address used for encapsulated traffic + * @param ip6_address IPv6 source address used for encapsulated traffic + * @return 0 on success. VNET_LB_ERR_XXX on error + */ +int lb_conf(ip4_address_t *ip4_address, ip6_address_t *ip6_address, + u32 sticky_buckets, u32 flow_timeout); + +int lb_vip_add(ip46_address_t *prefix, u8 plen, lb_vip_type_t type, + u32 new_length, u32 *vip_index); +int lb_vip_del(u32 vip_index); + +int lb_vip_find_index(ip46_address_t *prefix, u8 plen, u32 *vip_index); + +#define lb_vip_get_by_index(index) (pool_is_free_index(lb_main.vips, index)?NULL:pool_elt_at_index(lb_main.vips, index)) + +int lb_vip_add_ass(u32 vip_index, ip46_address_t *addresses, u32 n); +int lb_vip_del_ass(u32 vip_index, ip46_address_t *addresses, u32 n); + +u32 lb_hash_time_now(vlib_main_t * vm); + +void lb_garbage_collection(); + +format_function_t format_lb_main; + +#endif /* LB_PLUGIN_LB_LB_H_ */ diff --git a/src/plugins/lb/lb_plugin_doc.md b/src/plugins/lb/lb_plugin_doc.md new file mode 100644 index 00000000000..c7885ffb837 --- /dev/null +++ b/src/plugins/lb/lb_plugin_doc.md @@ -0,0 +1,141 @@ +# Load Balancer plugin for VPP {#lb_plugin_doc} + +## Version + +The load balancer plugin is currently in *beta* version. +Both CLIs and APIs are subject to *heavy* changes. +Wich also means feedback is really welcome regarding features, apis, etc... + +## Overview + +This plugin provides load balancing for VPP in a way that is largely inspired +from Google's MagLev: http://research.google.com/pubs/pub44824.html + +The load balancer is configured with a set of Virtual IPs (VIP, which can be +prefixes), and for each VIP, with a set of Application Server addresses (ASs). + +Traffic received for a given VIP (or VIP prefix) is tunneled using GRE towards +the different ASs in a way that (tries to) ensure that a given session will +always be tunneled to the same AS. + +Both VIPs or ASs can be IPv4 or IPv6, but for a given VIP, all ASs must be using +the same encap. type (i.e. IPv4+GRE or IPv6+GRE). Meaning that for a given VIP, +all AS addresses must be of the same family. + +## Performances + +The load balancer has been tested up to 1 millions flows and still forwards more +than 3Mpps per core in such circumstances. +Although 3Mpps seems already good, it is likely that performances will be improved +in next versions. + +## Configuration + +### Global LB parameters + +The load balancer needs to be configured with some parameters: + + lb conf [ip4-src-address <addr>] [ip6-src-address <addr>] + [buckets <n>] [timeout <s>] + +ip4-src-address: the source address used to send encap. packets using IPv4. + +ip6-src-address: the source address used to send encap. packets using IPv6. + +buckets: the *per-thread* established-connexions-table number of buckets. + +timeout: the number of seconds a connection will remain in the + established-connexions-table while no packet for this flow + is received. + + +### Configure the VIPs + + lb vip <prefix> [encap (gre6|gre4)] [new_len <n>] [del] + +new_len is the size of the new-connection-table. It should be 1 or 2 orders of +magnitude bigger than the number of ASs for the VIP in order to ensure a good +load balancing. + +Examples: + + lb vip 2002::/16 encap gre6 new_len 1024 + lb vip 2003::/16 encap gre4 new_len 2048 + lb vip 80.0.0.0/8 encap gre6 new_len 16 + lb vip 90.0.0.0/8 encap gre4 new_len 1024 + +### Configure the ASs (for each VIP) + + lb as <vip-prefix> [<address> [<address> [...]]] [del] + +You can add (or delete) as many ASs at a time (for a single VIP). +Note that the AS address family must correspond to the VIP encap. IP family. + +Examples: + + lb as 2002::/16 2001::2 2001::3 2001::4 + lb as 2003::/16 10.0.0.1 10.0.0.2 + lb as 80.0.0.0/8 2001::2 + lb as 90.0.0.0/8 10.0.0.1 + + + +## Monitoring + +The plugin provides quite a bunch of counters and information. +These are still subject to quite significant changes. + + show lb + show lb vip + show lb vip verbose + + show node counters + + +## Design notes + +### Multi-Threading + +MagLev is a distributed system which pseudo-randomly generates a +new-connections-table based on AS names such that each server configured with +the same set of ASs ends up with the same table. Connection stickyness is then +ensured with an established-connections-table. Using ECMP, it is assumed (but +not relied on) that servers will mostly receive traffic for different flows. + +This implementation pushes the parallelism a little bit further by using +one established-connections table per thread. This is equivalent to assuming +that RSS will make a job similar to ECMP, and is pretty useful as threads don't +need to get a lock in order to write in the table. + +### Hash Table + +A load balancer requires an efficient read and write hash table. The hash table +used by ip6-forward is very read-efficient, but not so much for writing. In +addition, it is not a big deal if writing into the hash table fails (again, +MagLev uses a flow table but does not heaviliy relies on it). + +The plugin therefore uses a very specific (and stupid) hash table. + - Fixed (and power of 2) number of buckets (configured at runtime) + - Fixed (and power of 2) elements per buckets (configured at compilation time) + +### Reference counting + +When an AS is removed, there is two possible ways to react. + - Keep using the AS for established connections + - Change AS for established connections (likely to cause error for TCP) + +In the first case, although an AS is removed from the configuration, its +associated state needs to stay around as long as it is used by at least one +thread. + +In order to avoid locks, a specific reference counter is used. The design is quite +similar to clib counters but: + - It is possible to decrease the value + - Summing will not zero the per-thread counters + - Only the thread can reallocate its own counters vector (to avoid concurrency issues) + +This reference counter is lock free, but reading a count of 0 does not mean +the value can be freed unless it is ensured by *other* means that no other thread +is concurrently referencing the object. In the case of this plugin, it is assumed +that no concurrent event will take place after a few seconds. + diff --git a/src/plugins/lb/lb_test.c b/src/plugins/lb/lb_test.c new file mode 100644 index 00000000000..8c2eaa91ce9 --- /dev/null +++ b/src/plugins/lb/lb_test.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vat/vat.h> +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vlibsocket/api.h> +#include <vppinfra/error.h> +#include <lb/lb.h> + +//TODO: Move that to vat/plugin_api.c +////////////////////////// +uword unformat_ip46_address (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + ip46_type_t type = va_arg (*args, ip46_type_t); + if ((type != IP46_TYPE_IP6) && + unformat(input, "%U", unformat_ip4_address, &ip46->ip4)) { + ip46_address_mask_ip4(ip46); + return 1; + } else if ((type != IP46_TYPE_IP4) && + unformat(input, "%U", unformat_ip6_address, &ip46->ip6)) { + return 1; + } + return 0; +} +uword unformat_ip46_prefix (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + u8 *len = va_arg (*args, u8 *); + ip46_type_t type = va_arg (*args, ip46_type_t); + + u32 l; + if ((type != IP46_TYPE_IP6) && unformat(input, "%U/%u", unformat_ip4_address, &ip46->ip4, &l)) { + if (l > 32) + return 0; + *len = l + 96; + ip46->pad[0] = ip46->pad[1] = ip46->pad[2] = 0; + } else if ((type != IP46_TYPE_IP4) && unformat(input, "%U/%u", unformat_ip6_address, &ip46->ip6, &l)) { + if (l > 128) + return 0; + *len = l; + } else { + return 0; + } + return 1; +} +///////////////////////// + +#define vl_msg_id(n,h) n, +typedef enum { +#include <lb/lb.api.h> + /* We'll want to know how many messages IDs we need... */ + VL_MSG_FIRST_AVAILABLE, +} vl_msg_id_t; +#undef vl_msg_id + +/* define message structures */ +#define vl_typedefs +#include <lb/lb.api.h> +#undef vl_typedefs + +/* declare message handlers for each api */ + +#define vl_endianfun /* define message structures */ +#include <lb/lb.api.h> +#undef vl_endianfun + +/* instantiate all the print functions we know about */ +#define vl_print(handle, ...) +#define vl_printfun +#include <lb/lb.api.h> +#undef vl_printfun + +/* Get the API version number. */ +#define vl_api_version(n,v) static u32 api_version=(v); +#include <lb/lb.api.h> +#undef vl_api_version + +typedef struct { + /* API message ID base */ + u16 msg_id_base; + vat_main_t *vat_main; +} lb_test_main_t; + +lb_test_main_t lb_test_main; + +#define foreach_standard_reply_retval_handler \ +_(lb_conf_reply) \ +_(lb_add_del_vip_reply) \ +_(lb_add_del_as_reply) + +#define _(n) \ + static void vl_api_##n##_t_handler \ + (vl_api_##n##_t * mp) \ + { \ + vat_main_t * vam = lb_test_main.vat_main; \ + i32 retval = ntohl(mp->retval); \ + if (vam->async_mode) { \ + vam->async_errors += (retval < 0); \ + } else { \ + vam->retval = retval; \ + vam->result_ready = 1; \ + } \ + } +foreach_standard_reply_retval_handler; +#undef _ + +/* + * Table of message reply handlers, must include boilerplate handlers + * we just generated + */ +#define foreach_vpe_api_reply_msg \ + _(LB_CONF_REPLY, lb_conf_reply) \ + _(LB_ADD_DEL_VIP_REPLY, lb_add_del_vip_reply) \ + _(LB_ADD_DEL_AS_REPLY, lb_add_del_as_reply) + +/* M: construct, but don't yet send a message */ +#define M(T,t) \ +do { \ + vam->result_ready = 0; \ + mp = vl_msg_api_alloc(sizeof(*mp)); \ + memcpy (mp, &mps, sizeof (*mp)); \ + mp->_vl_msg_id = ntohs (VL_API_##T + lbtm->msg_id_base); \ + mp->client_index = vam->my_client_index; \ +} while(0); + +/* S: send a message */ +#define S (vl_msg_api_send_shmem (vam->vl_input_queue, (u8 *)&mp)) + +/* W: wait for results, with timeout */ +#define W \ +do { \ + timeout = vat_time_now (vam) + 1.0; \ + \ + while (vat_time_now (vam) < timeout) { \ + if (vam->result_ready == 1) { \ + return (vam->retval); \ + } \ + } \ + return -99; \ +} while(0); + +static int api_lb_conf (vat_main_t * vam) +{ + lb_test_main_t *lbtm = &lb_test_main; + unformat_input_t *i = vam->input; + f64 timeout; + vl_api_lb_conf_t mps, *mp; + + if (!unformat(i, "%U %U %u %u", + unformat_ip4_address, &mps.ip4_src_address, + unformat_ip6_address, mps.ip6_src_address, + &mps.sticky_buckets_per_core, + &mps.flow_timeout)) { + errmsg ("invalid arguments\n"); + return -99; + } + + M(LB_CONF, lb_conf); S; W; + + /* NOTREACHED */ + return 0; +} + +static int api_lb_add_del_vip (vat_main_t * vam) +{ + lb_test_main_t *lbtm = &lb_test_main; + unformat_input_t * i = vam->input; + f64 timeout; + vl_api_lb_add_del_vip_t mps, *mp; + mps.is_del = 0; + mps.is_gre4 = 0; + + if (!unformat(i, "%U", + unformat_ip46_prefix, mps.ip_prefix, &mps.prefix_length, IP46_TYPE_ANY)) { + errmsg ("invalid prefix\n"); + return -99; + } + + if (unformat(i, "gre4")) { + mps.is_gre4 = 1; + } else if (unformat(i, "gre6")) { + mps.is_gre4 = 0; + } else { + errmsg ("no encap\n"); + return -99; + } + + if (!unformat(i, "%d", &mps.new_flows_table_length)) { + errmsg ("no table lentgh\n"); + return -99; + } + + if (unformat(i, "del")) { + mps.is_del = 1; + } + + M(LB_ADD_DEL_VIP, lb_add_del_vip); S; W; + /* NOTREACHED */ + return 0; +} + +static int api_lb_add_del_as (vat_main_t * vam) +{ + lb_test_main_t *lbtm = &lb_test_main; + unformat_input_t * i = vam->input; + f64 timeout; + vl_api_lb_add_del_as_t mps, *mp; + mps.is_del = 0; + + if (!unformat(i, "%U %U", + unformat_ip46_prefix, mps.vip_ip_prefix, &mps.vip_prefix_length, IP46_TYPE_ANY, + unformat_ip46_address, mps.as_address)) { + errmsg ("invalid prefix or address\n"); + return -99; + } + + if (unformat(i, "del")) { + mps.is_del = 1; + } + + M(LB_ADD_DEL_AS, lb_add_del_as); S; W; + /* NOTREACHED */ + return 0; +} + +/* + * List of messages that the api test plugin sends, + * and that the data plane plugin processes + */ +#define foreach_vpe_api_msg \ +_(lb_conf, "<ip4-src-addr> <ip6-src-address> <sticky_buckets_per_core> <flow_timeout>") \ +_(lb_add_del_vip, "<ip-prefix> [gre4|gre6] <new_table_len> [del]") \ +_(lb_add_del_as, "<vip-ip-prefix> <address> [del]") + +void vat_api_hookup (vat_main_t *vam) +{ + lb_test_main_t * lbtm = &lb_test_main; + /* Hook up handlers for replies from the data plane plug-in */ +#define _(N,n) \ + vl_msg_api_set_handlers((VL_API_##N + lbtm->msg_id_base), \ + #n, \ + vl_api_##n##_t_handler, \ + vl_noop_handler, \ + vl_api_##n##_t_endian, \ + vl_api_##n##_t_print, \ + sizeof(vl_api_##n##_t), 1); + foreach_vpe_api_reply_msg; +#undef _ + + /* API messages we can send */ +#define _(n,h) hash_set_mem (vam->function_by_name, #n, api_##n); + foreach_vpe_api_msg; +#undef _ + + /* Help strings */ +#define _(n,h) hash_set_mem (vam->help_by_name, #n, h); + foreach_vpe_api_msg; +#undef _ +} + +clib_error_t * vat_plugin_register (vat_main_t *vam) +{ + lb_test_main_t * lbtm = &lb_test_main; + + u8 * name; + + lbtm->vat_main = vam; + + /* Ask the vpp engine for the first assigned message-id */ + name = format (0, "lb_%08x%c", api_version, 0); + lbtm->msg_id_base = vl_client_get_first_plugin_msg_id ((char *) name); + + if (lbtm->msg_id_base != (u16) ~0) + vat_api_hookup (vam); + + vec_free(name); + + return 0; +} diff --git a/src/plugins/lb/lbhash.h b/src/plugins/lb/lbhash.h new file mode 100644 index 00000000000..ca3cc143dc2 --- /dev/null +++ b/src/plugins/lb/lbhash.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2012 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * vppinfra already includes tons of different hash tables. + * MagLev flow table is a bit different. It has to be very efficient + * for both writing and reading operations. But it does not need to + * be 100% reliable (write can fail). It also needs to recycle + * old entries in a lazy way. + * + * This hash table is the most dummy hash table you can do. + * Fixed total size, fixed bucket size. + * Advantage is that it could be very efficient (maybe). + * + */ + +#ifndef LB_PLUGIN_LB_LBHASH_H_ +#define LB_PLUGIN_LB_LBHASH_H_ + +#include <vnet/vnet.h> + +#if defined (__SSE4_2__) +#include <immintrin.h> +#endif + +/* + * @brief Number of entries per bucket. + */ +#define LBHASH_ENTRY_PER_BUCKET 4 + +#define LB_HASH_DO_NOT_USE_SSE_BUCKETS 0 + +/* + * @brief One bucket contains 4 entries. + * Each bucket takes one 64B cache line in memory. + */ +typedef struct { + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + u32 hash[LBHASH_ENTRY_PER_BUCKET]; + u32 timeout[LBHASH_ENTRY_PER_BUCKET]; + u32 vip[LBHASH_ENTRY_PER_BUCKET]; + u32 value[LBHASH_ENTRY_PER_BUCKET]; +} lb_hash_bucket_t; + +typedef struct { + u32 buckets_mask; + u32 timeout; + lb_hash_bucket_t buckets[]; +} lb_hash_t; + +#define lb_hash_nbuckets(h) (((h)->buckets_mask) + 1) +#define lb_hash_size(h) ((h)->buckets_mask + LBHASH_ENTRY_PER_BUCKET) + +#define lb_hash_foreach_bucket(h, bucket) \ + for (bucket = (h)->buckets; \ + bucket < (h)->buckets + lb_hash_nbuckets(h); \ + bucket++) + +#define lb_hash_foreach_entry(h, bucket, i) \ + lb_hash_foreach_bucket(h, bucket) \ + for (i = 0; i < LBHASH_ENTRY_PER_BUCKET; i++) + +#define lb_hash_foreach_valid_entry(h, bucket, i, now) \ + lb_hash_foreach_entry(h, bucket, i) \ + if (!clib_u32_loop_gt((now), bucket->timeout[i])) + +static_always_inline +lb_hash_t *lb_hash_alloc(u32 buckets, u32 timeout) +{ + if (!is_pow2(buckets)) + return NULL; + + // Allocate 1 more bucket for prefetch + u32 size = ((u64)&((lb_hash_t *)(0))->buckets[0]) + + sizeof(lb_hash_bucket_t) * (buckets + 1); + u8 *mem = 0; + lb_hash_t *h; + vec_alloc_aligned(mem, size, CLIB_CACHE_LINE_BYTES); + h = (lb_hash_t *)mem; + h->buckets_mask = (buckets - 1); + h->timeout = timeout; + return h; +} + +static_always_inline +void lb_hash_free(lb_hash_t *h) +{ + u8 *mem = (u8 *)h; + vec_free(mem); +} + +#if __SSE4_2__ +static_always_inline +u32 lb_hash_hash(u64 k0, u64 k1, u64 k2, u64 k3, u64 k4) +{ + u64 val = 0; + val = _mm_crc32_u64(val, k0); + val = _mm_crc32_u64(val, k1); + val = _mm_crc32_u64(val, k2); + val = _mm_crc32_u64(val, k3); + val = _mm_crc32_u64(val, k4); + return (u32) val; +} +#else +static_always_inline +u32 lb_hash_hash(u64 k0, u64 k1, u64 k2, u64 k3, u64 k4) +{ + u64 tmp = k0 ^ k1 ^ k2 ^ k3 ^ k4; + return (u32)clib_xxhash (tmp); +} +#endif + +static_always_inline +void lb_hash_prefetch_bucket(lb_hash_t *ht, u32 hash) +{ + lb_hash_bucket_t *bucket = &ht->buckets[hash & ht->buckets_mask]; + CLIB_PREFETCH(bucket, sizeof(*bucket), READ); +} + +static_always_inline +void lb_hash_get(lb_hash_t *ht, u32 hash, u32 vip, u32 time_now, + u32 *available_index, u32 *found_value) +{ + lb_hash_bucket_t *bucket = &ht->buckets[hash & ht->buckets_mask]; + *found_value = ~0; + *available_index = ~0; +#if __SSE4_2__ && LB_HASH_DO_NOT_USE_SSE_BUCKETS == 0 + u32 bitmask, found_index; + __m128i mask; + + // mask[*] = timeout[*] > now + mask = _mm_cmpgt_epi32(_mm_loadu_si128 ((__m128i *) bucket->timeout), + _mm_set1_epi32 (time_now)); + // bitmask[*] = now <= timeout[*/4] + bitmask = (~_mm_movemask_epi8(mask)) & 0xffff; + // Get first index with now <= timeout[*], if any. + *available_index = (bitmask)?__builtin_ctz(bitmask)/4:*available_index; + + // mask[*] = (timeout[*] > now) && (hash[*] == hash) + mask = _mm_and_si128(mask, + _mm_cmpeq_epi32( + _mm_loadu_si128 ((__m128i *) bucket->hash), + _mm_set1_epi32 (hash))); + + // Load the array of vip values + // mask[*] = (timeout[*] > now) && (hash[*] == hash) && (vip[*] == vip) + mask = _mm_and_si128(mask, + _mm_cmpeq_epi32( + _mm_loadu_si128 ((__m128i *) bucket->vip), + _mm_set1_epi32 (vip))); + + // mask[*] = (timeout[*x4] > now) && (hash[*x4] == hash) && (vip[*x4] == vip) + bitmask = _mm_movemask_epi8(mask); + // Get first index, if any + found_index = (bitmask)?__builtin_ctzll(bitmask)/4:0; + ASSERT(found_index < 4); + *found_value = (bitmask)?bucket->value[found_index]:*found_value; + bucket->timeout[found_index] = + (bitmask)?time_now + ht->timeout:bucket->timeout[found_index]; +#else + u32 i; + for (i = 0; i < LBHASH_ENTRY_PER_BUCKET; i++) { + u8 cmp = (bucket->hash[i] == hash && bucket->vip[i] == vip); + u8 timeouted = clib_u32_loop_gt(time_now, bucket->timeout[i]); + *found_value = (cmp || timeouted)?*found_value:bucket->value[i]; + bucket->timeout[i] = (cmp || timeouted)?time_now + ht->timeout:bucket->timeout[i]; + *available_index = (timeouted && (*available_index == ~0))?i:*available_index; + + if (!cmp) + return; + } +#endif +} + +static_always_inline +u32 lb_hash_available_value(lb_hash_t *h, u32 hash, u32 available_index) +{ + return h->buckets[hash & h->buckets_mask].value[available_index]; +} + +static_always_inline +void lb_hash_put(lb_hash_t *h, u32 hash, u32 value, u32 vip, + u32 available_index, u32 time_now) +{ + lb_hash_bucket_t *bucket = &h->buckets[hash & h->buckets_mask]; + bucket->hash[available_index] = hash; + bucket->value[available_index] = value; + bucket->timeout[available_index] = time_now + h->timeout; + bucket->vip[available_index] = vip; +} + +static_always_inline +u32 lb_hash_elts(lb_hash_t *h, u32 time_now) +{ + u32 tot = 0; + lb_hash_bucket_t *bucket; + u32 i; + lb_hash_foreach_valid_entry(h, bucket, i, time_now) { + tot++; + } + return tot; +} + +#endif /* LB_PLUGIN_LB_LBHASH_H_ */ diff --git a/src/plugins/lb/node.c b/src/plugins/lb/node.c new file mode 100644 index 00000000000..8b763c537d5 --- /dev/null +++ b/src/plugins/lb/node.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/lb.h> + +#include <vnet/gre/packet.h> +#include <lb/lbhash.h> + +#define foreach_lb_error \ + _(NONE, "no error") \ + _(PROTO_NOT_SUPPORTED, "protocol not supported") + +typedef enum { +#define _(sym,str) LB_ERROR_##sym, + foreach_lb_error +#undef _ + LB_N_ERROR, +} lb_error_t; + +static char *lb_error_strings[] = { +#define _(sym,string) string, + foreach_lb_error +#undef _ +}; + +typedef struct { + u32 vip_index; + u32 as_index; +} lb_trace_t; + +u8 * +format_lb_trace (u8 * s, va_list * args) +{ + lb_main_t *lbm = &lb_main; + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + lb_trace_t *t = va_arg (*args, lb_trace_t *); + if (pool_is_free_index(lbm->vips, t->vip_index)) { + s = format(s, "lb vip[%d]: This VIP was freed since capture\n"); + } else { + s = format(s, "lb vip[%d]: %U\n", t->vip_index, format_lb_vip, &lbm->vips[t->vip_index]); + } + if (pool_is_free_index(lbm->ass, t->as_index)) { + s = format(s, "lb as[%d]: This AS was freed since capture\n"); + } else { + s = format(s, "lb as[%d]: %U\n", t->as_index, format_lb_as, &lbm->ass[t->as_index]); + } + return s; +} + +lb_hash_t *lb_get_sticky_table(u32 cpu_index) +{ + lb_main_t *lbm = &lb_main; + lb_hash_t *sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + //Check if size changed + if (PREDICT_FALSE(sticky_ht && (lbm->per_cpu_sticky_buckets != lb_hash_nbuckets(sticky_ht)))) + { + //Dereference everything in there + lb_hash_bucket_t *b; + u32 i; + lb_hash_foreach_entry(sticky_ht, b, i) { + vlib_refcount_add(&lbm->as_refcount, cpu_index, b->value[i], -1); + vlib_refcount_add(&lbm->as_refcount, cpu_index, 0, 1); + } + + lb_hash_free(sticky_ht); + sticky_ht = NULL; + } + + //Create if necessary + if (PREDICT_FALSE(sticky_ht == NULL)) { + lbm->per_cpu[cpu_index].sticky_ht = lb_hash_alloc(lbm->per_cpu_sticky_buckets, lbm->flow_timeout); + sticky_ht = lbm->per_cpu[cpu_index].sticky_ht; + clib_warning("Regenerated sticky table %p", sticky_ht); + } + + ASSERT(sticky_ht); + + //Update timeout + sticky_ht->timeout = lbm->flow_timeout; + return sticky_ht; +} + +u64 +lb_node_get_other_ports4(ip4_header_t *ip40) +{ + return 0; +} + +u64 +lb_node_get_other_ports6(ip6_header_t *ip60) +{ + return 0; +} + +static_always_inline u32 +lb_node_get_hash(vlib_buffer_t *p, u8 is_input_v4) +{ + u32 hash; + if (is_input_v4) + { + ip4_header_t *ip40; + u64 ports; + ip40 = vlib_buffer_get_current (p); + if (PREDICT_TRUE (ip40->protocol == IP_PROTOCOL_TCP || + ip40->protocol == IP_PROTOCOL_UDP)) + ports = ((u64)((udp_header_t *)(ip40 + 1))->src_port << 16) | + ((u64)((udp_header_t *)(ip40 + 1))->dst_port); + else + ports = lb_node_get_other_ports4(ip40); + + hash = lb_hash_hash(*((u64 *)&ip40->address_pair), ports, + 0, 0, 0); + } + else + { + ip6_header_t *ip60; + ip60 = vlib_buffer_get_current (p); + u64 ports; + if (PREDICT_TRUE (ip60->protocol == IP_PROTOCOL_TCP || + ip60->protocol == IP_PROTOCOL_UDP)) + ports = ((u64)((udp_header_t *)(ip60 + 1))->src_port << 16) | + ((u64)((udp_header_t *)(ip60 + 1))->dst_port); + else + ports = lb_node_get_other_ports6(ip60); + + hash = lb_hash_hash(ip60->src_address.as_u64[0], + ip60->src_address.as_u64[1], + ip60->dst_address.as_u64[0], + ip60->dst_address.as_u64[1], + ports); + } + return hash; +} + +static_always_inline uword +lb_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame, + u8 is_input_v4, //Compile-time parameter stating that is input is v4 (or v6) + u8 is_encap_v4) //Compile-time parameter stating that is GRE encap is v4 (or v6) +{ + lb_main_t *lbm = &lb_main; + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + u32 cpu_index = os_get_cpu_number(); + u32 lb_time = lb_hash_time_now(vm); + + lb_hash_t *sticky_ht = lb_get_sticky_table(cpu_index); + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + u32 nexthash0 = 0; + if (PREDICT_TRUE(n_left_from > 0)) + nexthash0 = lb_node_get_hash(vlib_get_buffer (vm, from[0]), is_input_v4); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0; + vlib_buffer_t *p0; + lb_vip_t *vip0; + u32 asindex0; + u16 len0; + u32 available_index0; + u8 counter = 0; + u32 hash0 = nexthash0; + + if (PREDICT_TRUE(n_left_from > 1)) + { + vlib_buffer_t *p1 = vlib_get_buffer (vm, from[1]); + //Compute next hash and prefetch bucket + nexthash0 = lb_node_get_hash(p1, is_input_v4); + lb_hash_prefetch_bucket(sticky_ht, nexthash0); + //Prefetch for encap, next + CLIB_PREFETCH (vlib_buffer_get_current(p1) - 64, 64, STORE); + } + + if (PREDICT_TRUE(n_left_from > 2)) + { + vlib_buffer_t *p2; + p2 = vlib_get_buffer(vm, from[2]); + /* prefetch packet header and data */ + vlib_prefetch_buffer_header(p2, STORE); + CLIB_PREFETCH (vlib_buffer_get_current(p2), 64, STORE); + } + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + vip0 = pool_elt_at_index (lbm->vips, + vnet_buffer (p0)->ip.adj_index[VLIB_TX]); + + if (is_input_v4) + { + ip4_header_t *ip40; + ip40 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16(ip40->length); + } + else + { + ip6_header_t *ip60; + ip60 = vlib_buffer_get_current (p0); + len0 = clib_net_to_host_u16(ip60->payload_length) + sizeof(ip6_header_t); + } + + lb_hash_get(sticky_ht, hash0, vnet_buffer (p0)->ip.adj_index[VLIB_TX], + lb_time, &available_index0, &asindex0); + + if (PREDICT_TRUE(asindex0 != ~0)) + { + //Found an existing entry + counter = LB_VIP_COUNTER_NEXT_PACKET; + } + else if (PREDICT_TRUE(available_index0 != ~0)) + { + //There is an available slot for a new flow + asindex0 = vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; + counter = LB_VIP_COUNTER_FIRST_PACKET; + counter = (asindex0 == 0)?LB_VIP_COUNTER_NO_SERVER:counter; + + //TODO: There are race conditions with as0 and vip0 manipulation. + //Configuration may be changed, vectors resized, etc... + + //Dereference previously used + vlib_refcount_add(&lbm->as_refcount, cpu_index, + lb_hash_available_value(sticky_ht, hash0, available_index0), -1); + vlib_refcount_add(&lbm->as_refcount, cpu_index, + asindex0, 1); + + //Add sticky entry + //Note that when there is no AS configured, an entry is configured anyway. + //But no configured AS is not something that should happen + lb_hash_put(sticky_ht, hash0, asindex0, + vnet_buffer (p0)->ip.adj_index[VLIB_TX], + available_index0, lb_time); + } + else + { + //Could not store new entry in the table + asindex0 = vip0->new_flow_table[hash0 & vip0->new_flow_table_mask].as_index; + counter = LB_VIP_COUNTER_UNTRACKED_PACKET; + } + + vlib_increment_simple_counter(&lbm->vip_counters[counter], + cpu_index, + vnet_buffer (p0)->ip.adj_index[VLIB_TX], + 1); + + //Now let's encap + { + gre_header_t *gre0; + if (is_encap_v4) + { + ip4_header_t *ip40; + vlib_buffer_advance(p0, - sizeof(ip4_header_t) - sizeof(gre_header_t)); + ip40 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip40 + 1); + ip40->src_address = lbm->ip4_src_address; + ip40->dst_address = lbm->ass[asindex0].address.ip4; + ip40->ip_version_and_header_length = 0x45; + ip40->ttl = 128; + ip40->length = clib_host_to_net_u16(len0 + sizeof(gre_header_t) + sizeof(ip4_header_t)); + ip40->protocol = IP_PROTOCOL_GRE; + ip40->checksum = ip4_header_checksum (ip40); + } + else + { + ip6_header_t *ip60; + vlib_buffer_advance(p0, - sizeof(ip6_header_t) - sizeof(gre_header_t)); + ip60 = vlib_buffer_get_current(p0); + gre0 = (gre_header_t *)(ip60 + 1); + ip60->dst_address = lbm->ass[asindex0].address.ip6; + ip60->src_address = lbm->ip6_src_address; + ip60->hop_limit = 128; + ip60->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6<<28); + ip60->payload_length = clib_host_to_net_u16(len0 + sizeof(gre_header_t)); + ip60->protocol = IP_PROTOCOL_GRE; + } + + gre0->flags_and_version = 0; + gre0->protocol = (is_input_v4)? + clib_host_to_net_u16(0x0800): + clib_host_to_net_u16(0x86DD); + } + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + lb_trace_t *tr = vlib_add_trace (vm, node, p0, sizeof (*tr)); + tr->as_index = asindex0; + tr->vip_index = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + } + + //Enqueue to next + //Note that this is going to error if asindex0 == 0 + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = lbm->ass[asindex0].dpo.dpoi_index; + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, pi0, + lbm->ass[asindex0].dpo.dpoi_next_node); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static uword +lb6_gre6_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 0, 0); +} + +static uword +lb6_gre4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 0, 1); +} + +static uword +lb4_gre6_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 1, 0); +} + +static uword +lb4_gre4_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, vlib_frame_t * frame) +{ + return lb_node_fn(vm, node, frame, 1, 1); +} + +VLIB_REGISTER_NODE (lb6_gre6_node) = +{ + .function = lb6_gre6_node_fn, + .name = "lb6-gre6", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + +VLIB_REGISTER_NODE (lb6_gre4_node) = +{ + .function = lb6_gre4_node_fn, + .name = "lb6-gre4", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + +VLIB_REGISTER_NODE (lb4_gre6_node) = +{ + .function = lb4_gre6_node_fn, + .name = "lb4-gre6", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + +VLIB_REGISTER_NODE (lb4_gre4_node) = +{ + .function = lb4_gre4_node_fn, + .name = "lb4-gre4", + .vector_size = sizeof (u32), + .format_trace = format_lb_trace, + + .n_errors = LB_N_ERROR, + .error_strings = lb_error_strings, + + .n_next_nodes = LB_N_NEXT, + .next_nodes = + { + [LB_NEXT_DROP] = "error-drop" + }, +}; + diff --git a/src/plugins/lb/refcount.c b/src/plugins/lb/refcount.c new file mode 100644 index 00000000000..22415c8889e --- /dev/null +++ b/src/plugins/lb/refcount.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/refcount.h> + +void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size) +{ + u32 *new_counter = 0, *old_counter; + vec_validate(new_counter, size); + memcpy(new_counter, per_cpu->counters, per_cpu->length); + old_counter = per_cpu->counters; + per_cpu->counters = new_counter; + CLIB_MEMORY_BARRIER(); + per_cpu->length = vec_len(new_counter); + vec_free(old_counter); +} + +u64 vlib_refcount_get(vlib_refcount_t *r, u32 index) +{ + u64 count = 0; + vlib_thread_main_t *tm = vlib_get_thread_main (); + u32 cpu_index; + for (cpu_index = 0; cpu_index < tm->n_vlib_mains; cpu_index++) { + if (r->per_cpu[cpu_index].length > index) + count += r->per_cpu[cpu_index].counters[index]; + } + return count; +} + diff --git a/src/plugins/lb/refcount.h b/src/plugins/lb/refcount.h new file mode 100644 index 00000000000..8c26e7be76f --- /dev/null +++ b/src/plugins/lb/refcount.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * vlib provides lock-free counters but those + * - Have 16bits per-CPU counter, which may overflow. + * - Would only increment. + * + * This is very similar to vlib counters, but may be used to count reference. + * Such a counter includes an arbitrary number of counters. Each counter + * is identified by its index. This is used to aggregate per-cpu memory. + * + * Warning: + * This reference counter is lock-free but is not race-condition free. + * The counting result is approximate and another mechanism needs to be used + * in order to ensure that an object may be freed. + * + */ + +#include <vnet/vnet.h> + +typedef struct { + u32 *counters; + u32 length; + u32 *reader_lengths; + CLIB_CACHE_LINE_ALIGN_MARK(o); +} vlib_refcount_per_cpu_t; + +typedef struct { + vlib_refcount_per_cpu_t *per_cpu; +} vlib_refcount_t; + +void __vlib_refcount_resize(vlib_refcount_per_cpu_t *per_cpu, u32 size); + +static_always_inline +void vlib_refcount_add(vlib_refcount_t *r, u32 cpu_index, u32 counter_index, i32 v) +{ + vlib_refcount_per_cpu_t *per_cpu = &r->per_cpu[cpu_index]; + if (PREDICT_FALSE(counter_index >= per_cpu->length)) + __vlib_refcount_resize(per_cpu, clib_max(counter_index + 16, per_cpu->length * 2)); + + per_cpu->counters[counter_index] += v; +} + +u64 vlib_refcount_get(vlib_refcount_t *r, u32 index); + +static_always_inline +void vlib_refcount_init(vlib_refcount_t *r) +{ + vlib_thread_main_t *tm = vlib_get_thread_main (); + r->per_cpu = 0; + vec_validate (r->per_cpu, tm->n_vlib_mains - 1); +} + + diff --git a/src/plugins/lb/util.c b/src/plugins/lb/util.c new file mode 100644 index 00000000000..d969d168dce --- /dev/null +++ b/src/plugins/lb/util.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <lb/util.h> + +void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen) +{ + if (plen == 0) { + prefix->as_u64[0] = 0; + prefix->as_u64[1] = 0; + } else if (plen <= 64) { + prefix->as_u64[0] &= clib_host_to_net_u64(0xffffffffffffffffL << (64 - plen)); + prefix->as_u64[1] = 0; + } else { + prefix->as_u64[1] &= clib_host_to_net_u64(0xffffffffffffffffL << (128 - plen)); + } + +} + +uword unformat_ip46_prefix (unformat_input_t * input, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + u8 *len = va_arg (*args, u8 *); + ip46_type_t type = va_arg (*args, ip46_type_t); + + u32 l; + if ((type != IP46_TYPE_IP6) && unformat(input, "%U/%u", unformat_ip4_address, &ip46->ip4, &l)) { + if (l > 32) + return 0; + *len = l + 96; + ip46->pad[0] = ip46->pad[1] = ip46->pad[2] = 0; + } else if ((type != IP46_TYPE_IP4) && unformat(input, "%U/%u", unformat_ip6_address, &ip46->ip6, &l)) { + if (l > 128) + return 0; + *len = l; + } else { + return 0; + } + return 1; +} + +u8 *format_ip46_prefix (u8 * s, va_list * args) +{ + ip46_address_t *ip46 = va_arg (*args, ip46_address_t *); + u32 len = va_arg (*args, u32); //va_arg cannot use u8 or u16 + ip46_type_t type = va_arg (*args, ip46_type_t); + + int is_ip4 = 0; + if (type == IP46_TYPE_IP4) + is_ip4 = 1; + else if (type == IP46_TYPE_IP6) + is_ip4 = 0; + else + is_ip4 = (len >= 96) && ip46_address_is_ip4(ip46); + + return is_ip4 ? + format(s, "%U/%d", format_ip4_address, &ip46->ip4, len - 96): + format(s, "%U/%d", format_ip6_address, &ip46->ip6, len); +} + diff --git a/src/plugins/lb/util.h b/src/plugins/lb/util.h new file mode 100644 index 00000000000..3f082310b69 --- /dev/null +++ b/src/plugins/lb/util.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Non-LB specific stuff comes here + */ + +#ifndef LB_PLUGIN_LB_UTIL_H_ +#define LB_PLUGIN_LB_UTIL_H_ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> + +#define ip46_address_type(ip46) (ip46_address_is_ip4(ip46)?IP46_TYPE_IP4:IP46_TYPE_IP6) +#define ip46_prefix_is_ip4(ip46, len) ((len) >= 96 && ip46_address_is_ip4(ip46)) +#define ip46_prefix_type(ip46, len) (ip46_prefix_is_ip4(ip46, len)?IP46_TYPE_IP4:IP46_TYPE_IP6) + +void ip46_prefix_normalize(ip46_address_t *prefix, u8 plen); +uword unformat_ip46_prefix (unformat_input_t * input, va_list * args); +u8 *format_ip46_prefix (u8 * s, va_list * args); + +/** + * 32 bits integer comparison for running values. + * 1 > 0 is true. But 1 > 0xffffffff also is. + */ +#define clib_u32_loop_gt(a, b) (((u32)(a)) - ((u32)(b)) < 0x7fffffff) + +#endif /* LB_PLUGIN_LB_UTIL_H_ */ |