Diffstat (limited to 'src/vnet/dpo')
-rw-r--r-- | src/vnet/dpo/classify_dpo.c | 131
-rw-r--r-- | src/vnet/dpo/classify_dpo.h | 56
-rw-r--r-- | src/vnet/dpo/dpo.c | 500
-rw-r--r-- | src/vnet/dpo/dpo.h | 381
-rw-r--r-- | src/vnet/dpo/drop_dpo.c | 106
-rw-r--r-- | src/vnet/dpo/drop_dpo.h | 31
-rw-r--r-- | src/vnet/dpo/ip_null_dpo.c | 408
-rw-r--r-- | src/vnet/dpo/ip_null_dpo.h | 56
-rw-r--r-- | src/vnet/dpo/load_balance.c | 993
-rw-r--r-- | src/vnet/dpo/load_balance.h | 211
-rw-r--r-- | src/vnet/dpo/load_balance_map.c | 575
-rw-r--r-- | src/vnet/dpo/load_balance_map.h | 79
-rw-r--r-- | src/vnet/dpo/lookup_dpo.c | 1185
-rw-r--r-- | src/vnet/dpo/lookup_dpo.h | 108
-rw-r--r-- | src/vnet/dpo/mpls_label_dpo.c | 570
-rw-r--r-- | src/vnet/dpo/mpls_label_dpo.h | 101
-rw-r--r-- | src/vnet/dpo/punt_dpo.c | 100
-rw-r--r-- | src/vnet/dpo/punt_dpo.h | 30
-rw-r--r-- | src/vnet/dpo/receive_dpo.c | 165
-rw-r--r-- | src/vnet/dpo/receive_dpo.h | 62
20 files changed, 5848 insertions, 0 deletions
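
The diff below introduces the DPO base API (dpo_set/dpo_stack/dpo_stack_from_node/dpo_reset, dpo_register) together with the concrete classify, drop, ip-null, load-balance, lookup, MPLS-label, punt and receive DPO types. As orientation before the diff itself, the following minimal sketch shows how a consumer of this API could create a classify DPO and stack its own DPO on top of it. It uses only functions declared in the patch; the "my_"-prefixed names are hypothetical and are not part of the change.

/*
 * Usage sketch only - not part of the patch. All dpo_* calls and
 * classify_dpo_create() are introduced by the diff below; the function
 * and variable names prefixed with "my_" are invented for illustration.
 */
#include <vnet/dpo/dpo.h>
#include <vnet/dpo/classify_dpo.h>

static void
my_node_stack_on_classify (u32 my_node_index,
                           dpo_id_t *my_dpo,
                           u32 table_index)
{
    dpo_id_t parent = DPO_INVALID;

    /* create a classify DPO instance; dpo_set() takes the first lock on it */
    dpo_set(&parent, DPO_CLASSIFY, DPO_PROTO_IP4,
            classify_dpo_create(DPO_PROTO_IP4, table_index));

    /*
     * add a VLIB graph arc from my-node to "ip4-classify" (if one does not
     * already exist) and atomically re-parent *my_dpo onto the classify DPO
     */
    dpo_stack_from_node(my_node_index, my_dpo, &parent);

    /* release the temporary reference; *my_dpo now holds its own lock */
    dpo_reset(&parent);
}

The same pattern applies to any parent type: dpo_set() locks the parent instance, stacking copies it (with the resolved next-node edge) into the child in a single atomic u64 write, and dpo_reset() drops the caller's temporary reference.
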
diff --git a/src/vnet/dpo/classify_dpo.c b/src/vnet/dpo/classify_dpo.c new file mode 100644 index 00000000000..9e7886c9edd --- /dev/null +++ b/src/vnet/dpo/classify_dpo.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/ip/ip.h> +#include <vnet/dpo/classify_dpo.h> +#include <vnet/mpls/mpls.h> + +/* + * pool of all MPLS Label DPOs + */ +classify_dpo_t *classify_dpo_pool; + +static classify_dpo_t * +classify_dpo_alloc (void) +{ + classify_dpo_t *cd; + + pool_get_aligned(classify_dpo_pool, cd, CLIB_CACHE_LINE_BYTES); + memset(cd, 0, sizeof(*cd)); + + return (cd); +} + +static index_t +classify_dpo_get_index (classify_dpo_t *cd) +{ + return (cd - classify_dpo_pool); +} + +index_t +classify_dpo_create (dpo_proto_t proto, + u32 classify_table_index) +{ + classify_dpo_t *cd; + + cd = classify_dpo_alloc(); + cd->cd_proto = proto; + cd->cd_table_index = classify_table_index; + + return (classify_dpo_get_index(cd)); +} + +u8* +format_classify_dpo (u8 *s, va_list *args) +{ + index_t index = va_arg (*args, index_t); + CLIB_UNUSED(u32 indent) = va_arg (*args, u32); + classify_dpo_t *cd; + + cd = classify_dpo_get(index); + + return (format(s, "%U-classify:[%d]:table:%d", + format_dpo_proto, cd->cd_proto, + index, cd->cd_table_index)); +} + +static void +classify_dpo_lock (dpo_id_t *dpo) +{ + classify_dpo_t *cd; + + cd = classify_dpo_get(dpo->dpoi_index); + + cd->cd_locks++; +} + +static void +classify_dpo_unlock (dpo_id_t *dpo) +{ + classify_dpo_t *cd; + + cd = classify_dpo_get(dpo->dpoi_index); + + cd->cd_locks--; + + if (0 == cd->cd_locks) + { + pool_put(classify_dpo_pool, cd); + } +} + +static void +classify_dpo_mem_show (void) +{ + fib_show_memory_usage("Classify", + pool_elts(classify_dpo_pool), + pool_len(classify_dpo_pool), + sizeof(classify_dpo_t)); +} + +const static dpo_vft_t cd_vft = { + .dv_lock = classify_dpo_lock, + .dv_unlock = classify_dpo_unlock, + .dv_format = format_classify_dpo, + .dv_mem_show = classify_dpo_mem_show, +}; + +const static char* const classify_ip4_nodes[] = +{ + "ip4-classify", + NULL, +}; +const static char* const classify_ip6_nodes[] = +{ + "ip6-classify", + NULL, +}; +const static char* const * const classify_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = classify_ip4_nodes, + [DPO_PROTO_IP6] = classify_ip6_nodes, + [DPO_PROTO_MPLS] = NULL, +}; + +void +classify_dpo_module_init (void) +{ + dpo_register(DPO_CLASSIFY, &cd_vft, classify_nodes); +} diff --git a/src/vnet/dpo/classify_dpo.h b/src/vnet/dpo/classify_dpo.h new file mode 100644 index 00000000000..48f4b2bf8a5 --- /dev/null +++ b/src/vnet/dpo/classify_dpo.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CLASSIFY_DPO_H__ +#define __CLASSIFY_DPO_H__ + +#include <vnet/vnet.h> +#include <vnet/mpls/packet.h> +#include <vnet/dpo/dpo.h> + +/** + * A representation of an MPLS label for imposition in the data-path + */ +typedef struct classify_dpo_t +{ + dpo_proto_t cd_proto; + + u32 cd_table_index; + + /** + * Number of locks/users of the label + */ + u16 cd_locks; +} classify_dpo_t; + +extern index_t classify_dpo_create(dpo_proto_t proto, + u32 classify_table_index); + +extern u8* format_classify_dpo(u8 *s, va_list *args); + +/* + * Encapsulation violation for fast data-path access + */ +extern classify_dpo_t *classify_dpo_pool; + +static inline classify_dpo_t * +classify_dpo_get (index_t index) +{ + return (pool_elt_at_index(classify_dpo_pool, index)); +} + +extern void classify_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/dpo.c b/src/vnet/dpo/dpo.c new file mode 100644 index 00000000000..688d2892412 --- /dev/null +++ b/src/vnet/dpo/dpo.c @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * A Data-Path Object is an object that represents actions that are + * applied to packets are they are switched through VPP. + * + * The DPO is a base class that is specialised by other objects to provide + * concreate actions + * + * The VLIB graph nodes are graph of types, the DPO graph is a graph of instances. + */ + +#include <vnet/dpo/dpo.h> +#include <vnet/ip/lookup.h> +#include <vnet/ip/format.h> +#include <vnet/adj/adj.h> + +#include <vnet/dpo/load_balance.h> +#include <vnet/dpo/mpls_label_dpo.h> +#include <vnet/dpo/lookup_dpo.h> +#include <vnet/dpo/drop_dpo.h> +#include <vnet/dpo/receive_dpo.h> +#include <vnet/dpo/punt_dpo.h> +#include <vnet/dpo/classify_dpo.h> +#include <vnet/dpo/ip_null_dpo.h> + +/** + * Array of char* names for the DPO types and protos + */ +static const char* dpo_type_names[] = DPO_TYPES; +static const char* dpo_proto_names[] = DPO_PROTOS; + +/** + * @brief Vector of virtual function tables for the DPO types + * + * This is a vector so we can dynamically register new DPO types in plugins. + */ +static dpo_vft_t *dpo_vfts; + +/** + * @brief vector of graph node names associated with each DPO type and protocol. + * + * dpo_nodes[child_type][child_proto][node_X] = node_name; + * i.e. + * dpo_node[DPO_LOAD_BALANCE][DPO_PROTO_IP4][0] = "ip4-lookup" + * dpo_node[DPO_LOAD_BALANCE][DPO_PROTO_IP4][1] = "ip4-load-balance" + * + * This is a vector so we can dynamically register new DPO types in plugins. 
+ */ +static const char* const * const ** dpo_nodes; + +/** + * @brief Vector of edge indicies from parent DPO nodes to child + * + * dpo_edges[child_type][child_proto][parent_type][parent_proto] = edge_index + * + * This array is derived at init time from the dpo_nodes above. Note that + * the third dimension in dpo_nodes is lost, hence, the edge index from each + * node MUST be the same. + * Including both the child and parent protocol is required to support the + * case where it changes as the grapth is traversed, most notablly when an + * MPLS label is popped. + * + * Note that this array is child type specific, not child instance specific. + */ +static u32 ****dpo_edges; + +/** + * @brief The DPO type value that can be assigend to the next dynamic + * type registration. + */ +static dpo_type_t dpo_dynamic = DPO_LAST; + +dpo_proto_t +vnet_link_to_dpo_proto (vnet_link_t linkt) +{ + switch (linkt) + { + case VNET_LINK_IP6: + return (DPO_PROTO_IP6); + case VNET_LINK_IP4: + return (DPO_PROTO_IP4); + case VNET_LINK_MPLS: + return (DPO_PROTO_MPLS); + case VNET_LINK_ETHERNET: + return (DPO_PROTO_ETHERNET); + case VNET_LINK_ARP: + break; + } + ASSERT(0); + return (0); +} + +u8 * +format_dpo_type (u8 * s, va_list * args) +{ + dpo_type_t type = va_arg (*args, int); + + s = format(s, "%s", dpo_type_names[type]); + + return (s); +} + +u8 * +format_dpo_id (u8 * s, va_list * args) +{ + dpo_id_t *dpo = va_arg (*args, dpo_id_t*); + u32 indent = va_arg (*args, u32); + + s = format(s, "[@%d]: ", dpo->dpoi_next_node); + + if (NULL != dpo_vfts[dpo->dpoi_type].dv_format) + { + return (format(s, "%U", + dpo_vfts[dpo->dpoi_type].dv_format, + dpo->dpoi_index, + indent)); + } + + switch (dpo->dpoi_type) + { + case DPO_FIRST: + s = format(s, "unset"); + break; + default: + s = format(s, "unknown"); + break; + } + return (s); +} + +u8 * +format_dpo_proto (u8 * s, va_list * args) +{ + dpo_proto_t proto = va_arg (*args, int); + + return (format(s, "%s", dpo_proto_names[proto])); +} + +void +dpo_set (dpo_id_t *dpo, + dpo_type_t type, + dpo_proto_t proto, + index_t index) +{ + dpo_id_t tmp = *dpo; + + dpo->dpoi_type = type; + dpo->dpoi_proto = proto, + dpo->dpoi_index = index; + + if (DPO_ADJACENCY == type) + { + /* + * set the adj subtype + */ + ip_adjacency_t *adj; + + adj = adj_get(index); + + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_ARP: + dpo->dpoi_type = DPO_ADJACENCY_INCOMPLETE; + break; + case IP_LOOKUP_NEXT_MIDCHAIN: + dpo->dpoi_type = DPO_ADJACENCY_MIDCHAIN; + break; + default: + break; + } + } + dpo_lock(dpo); + dpo_unlock(&tmp); +} + +void +dpo_reset (dpo_id_t *dpo) +{ + dpo_id_t tmp = DPO_INVALID; + + /* + * use the atomic copy operation. + */ + dpo_copy(dpo, &tmp); +} + +/** + * \brief + * Compare two Data-path objects + * + * like memcmp, return 0 is matching, !0 otherwise. + */ +int +dpo_cmp (const dpo_id_t *dpo1, + const dpo_id_t *dpo2) +{ + int res; + + res = dpo1->dpoi_type - dpo2->dpoi_type; + + if (0 != res) return (res); + + return (dpo1->dpoi_index - dpo2->dpoi_index); +} + +void +dpo_copy (dpo_id_t *dst, + const dpo_id_t *src) +{ + dpo_id_t tmp = *dst; + + /* + * the destination is written in a single u64 write - hence atomically w.r.t + * any packets inflight. 
+ */ + *((u64*)dst) = *(u64*)src; + + dpo_lock(dst); + dpo_unlock(&tmp); +} + +int +dpo_is_adj (const dpo_id_t *dpo) +{ + return ((dpo->dpoi_type == DPO_ADJACENCY) || + (dpo->dpoi_type == DPO_ADJACENCY_INCOMPLETE) || + (dpo->dpoi_type == DPO_ADJACENCY_MIDCHAIN) || + (dpo->dpoi_type == DPO_ADJACENCY_GLEAN)); +} + +void +dpo_register (dpo_type_t type, + const dpo_vft_t *vft, + const char * const * const * nodes) +{ + vec_validate(dpo_vfts, type); + dpo_vfts[type] = *vft; + + vec_validate(dpo_nodes, type); + dpo_nodes[type] = nodes; +} + +dpo_type_t +dpo_register_new_type (const dpo_vft_t *vft, + const char * const * const * nodes) +{ + dpo_type_t type = dpo_dynamic++; + + dpo_register(type, vft, nodes); + + return (type); +} + +void +dpo_lock (dpo_id_t *dpo) +{ + if (!dpo_id_is_valid(dpo)) + return; + + dpo_vfts[dpo->dpoi_type].dv_lock(dpo); +} + +void +dpo_unlock (dpo_id_t *dpo) +{ + if (!dpo_id_is_valid(dpo)) + return; + + dpo_vfts[dpo->dpoi_type].dv_unlock(dpo); +} + + +static u32 +dpo_get_next_node (dpo_type_t child_type, + dpo_proto_t child_proto, + const dpo_id_t *parent_dpo) +{ + dpo_proto_t parent_proto; + dpo_type_t parent_type; + + parent_type = parent_dpo->dpoi_type; + parent_proto = parent_dpo->dpoi_proto; + + vec_validate(dpo_edges, child_type); + vec_validate(dpo_edges[child_type], child_proto); + vec_validate(dpo_edges[child_type][child_proto], parent_type); + vec_validate_init_empty( + dpo_edges[child_type][child_proto][parent_type], + parent_proto, ~0); + + /* + * if the edge index has not yet been created for this node to node transistion + */ + if (~0 == dpo_edges[child_type][child_proto][parent_type][parent_proto]) + { + vlib_node_t *parent_node, *child_node; + vlib_main_t *vm; + u32 edge ,pp, cc; + + vm = vlib_get_main(); + + ASSERT(NULL != dpo_nodes[child_type]); + ASSERT(NULL != dpo_nodes[child_type][child_proto]); + ASSERT(NULL != dpo_nodes[parent_type]); + ASSERT(NULL != dpo_nodes[parent_type][parent_proto]); + + cc = 0; + + /* + * create a graph arc from each of the parent's registered node types, + * to each of the childs. + */ + while (NULL != dpo_nodes[child_type][child_proto][cc]) + { + child_node = + vlib_get_node_by_name(vm, + (u8*) dpo_nodes[child_type][child_proto][cc]); + + pp = 0; + + while (NULL != dpo_nodes[parent_type][parent_proto][pp]) + { + parent_node = + vlib_get_node_by_name(vm, + (u8*) dpo_nodes[parent_type][parent_proto][pp]); + + edge = vlib_node_add_next(vm, + child_node->index, + parent_node->index); + + if (~0 == dpo_edges[child_type][child_proto][parent_type][parent_proto]) + { + dpo_edges[child_type][child_proto][parent_type][parent_proto] = edge; + } + else + { + ASSERT(dpo_edges[child_type][child_proto][parent_type][parent_proto] == edge); + } + pp++; + } + cc++; + } + } + + return (dpo_edges[child_type][child_proto][parent_type][parent_proto]); +} + +/** + * @brief Stack one DPO object on another, and thus establish a child parent + * relationship. The VLIB graph arc used is taken from the parent and child types + * passed. + */ +static void +dpo_stack_i (u32 edge, + dpo_id_t *dpo, + const dpo_id_t *parent) +{ + /* + * in order to get an atomic update of the parent we create a temporary, + * from a copy of the child, and add the next_node. then we copy to the parent + */ + dpo_id_t tmp = DPO_INVALID; + dpo_copy(&tmp, parent); + + /* + * get the edge index for the parent to child VLIB graph transisition + */ + tmp.dpoi_next_node = edge; + + /* + * this update is atomic. 
+ */ + dpo_copy(dpo, &tmp); + + dpo_reset(&tmp); +} + +/** + * @brief Stack one DPO object on another, and thus establish a child-parent + * relationship. The VLIB graph arc used is taken from the parent and child types + * passed. + */ +void +dpo_stack (dpo_type_t child_type, + dpo_proto_t child_proto, + dpo_id_t *dpo, + const dpo_id_t *parent) +{ + dpo_stack_i(dpo_get_next_node(child_type, child_proto, parent), dpo, parent); +} + +/** + * @brief Stack one DPO object on another, and thus establish a child parent + * relationship. A new VLIB graph arc is created from the child node passed + * to the nodes registered by the parent. The VLIB infra will ensure this arc + * is added only once. + */ +void +dpo_stack_from_node (u32 child_node_index, + dpo_id_t *dpo, + const dpo_id_t *parent) +{ + dpo_proto_t parent_proto; + vlib_node_t *parent_node; + dpo_type_t parent_type; + vlib_main_t *vm; + u32 edge; + + parent_type = parent->dpoi_type; + parent_proto = parent->dpoi_proto; + + vm = vlib_get_main(); + + ASSERT(NULL != dpo_nodes[parent_type]); + ASSERT(NULL != dpo_nodes[parent_type][parent_proto]); + + parent_node = + vlib_get_node_by_name(vm, (u8*) dpo_nodes[parent_type][parent_proto][0]); + + edge = vlib_node_add_next(vm, + child_node_index, + parent_node->index); + + dpo_stack_i(edge, dpo, parent); +} + +static clib_error_t * +dpo_module_init (vlib_main_t * vm) +{ + drop_dpo_module_init(); + punt_dpo_module_init(); + receive_dpo_module_init(); + load_balance_module_init(); + mpls_label_dpo_module_init(); + classify_dpo_module_init(); + lookup_dpo_module_init(); + ip_null_dpo_module_init(); + + return (NULL); +} + +VLIB_INIT_FUNCTION(dpo_module_init); + +static clib_error_t * +dpo_memory_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + dpo_vft_t *vft; + + vlib_cli_output (vm, "DPO memory"); + vlib_cli_output (vm, "%=30s %=5s %=8s/%=9s totals", + "Name","Size", "in-use", "allocated"); + + vec_foreach(vft, dpo_vfts) + { + if (NULL != vft->dv_mem_show) + vft->dv_mem_show(); + } + + return (NULL); +} + +/* *INDENT-OFF* */ +/*? + * The '<em>sh dpo memory </em>' command displays the memory usage for each + * data-plane object type. + * + * @cliexpar + * @cliexstart{show dpo memory} + * DPO memory + * Name Size in-use /allocated totals + * load-balance 64 12 / 12 768/768 + * Adjacency 256 1 / 1 256/256 + * Receive 24 5 / 5 120/120 + * Lookup 12 0 / 0 0/0 + * Classify 12 0 / 0 0/0 + * MPLS label 24 0 / 0 0/0 + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_fib_memory, static) = { + .path = "show dpo memory", + .function = dpo_memory_show, + .short_help = "show dpo memory", +}; +/* *INDENT-ON* */ diff --git a/src/vnet/dpo/dpo.h b/src/vnet/dpo/dpo.h new file mode 100644 index 00000000000..1efcbc8834b --- /dev/null +++ b/src/vnet/dpo/dpo.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * @brief + * A Data-Path Object is an object that represents actions that are + * applied to packets are they are switched through VPP's data-path. + * + * The DPO can be considered to be like is a base class that is specialised + * by other objects to provide concreate actions + * + * The VLIB graph nodes are graph of DPO types, the DPO graph is a graph of + * instances. + */ + +#ifndef __DPO_H__ +#define __DPO_H__ + +#include <vnet/vnet.h> + +/** + * @brief An index for adjacencies. + * Alas 'C' is not typesafe enough to b0rk when a u32 is used instead of + * an index_t. However, for us humans, we can glean much more intent + * from the declaration + * foo barindex_t t); + * than we can from + * foo bar(u32 t); + */ +typedef u32 index_t; + +/** + * @brief Invalid index - used when no index is known + * blazoned capitals INVALID speak volumes where ~0 does not. + */ +#define INDEX_INVALID ((index_t)(~0)) + +/** + * @brief Data path protocol. + * Actions performed on packets in the data-plane can be described and represented + * by protocol independent objects, i.e. ADJACENCY, but the spceifics actions + * required during ADJACENCY processing can be protocol dependent. For example, + * the adjacency rewrite node performs a ip4 checksum calculation, ip6 and MPLS + * do not, all 3 perform a TTL decrement. The VLIB graph nodes are thus protocol + * dependent, and thus each graph edge/arc is too. + * When programming a DPO's next node arc from child to parent it is thus required + * to know the parent's data-path protocol so the correct arc index can be used. + */ +typedef enum dpo_proto_t_ +{ +#if CLIB_DEBUG > 0 + DPO_PROTO_IP4 = 1, +#else + DPO_PROTO_IP4 = 0, +#endif + DPO_PROTO_IP6, + DPO_PROTO_ETHERNET, + DPO_PROTO_MPLS, +} __attribute__((packed)) dpo_proto_t; + +#define DPO_PROTO_NUM ((dpo_proto_t)(DPO_PROTO_MPLS+1)) +#define DPO_PROTO_NONE ((dpo_proto_t)(DPO_PROTO_NUM+1)) + +#define DPO_PROTOS { \ + [DPO_PROTO_IP4] = "ip4", \ + [DPO_PROTO_IP6] = "ip6", \ + [DPO_PROTO_ETHERNET] = "ethernet", \ + [DPO_PROTO_MPLS] = "mpls", \ +} + +#define FOR_EACH_DPO_PROTO(_proto) \ + for (_proto = DPO_PROTO_IP4; \ + _proto <= DPO_PROTO_MPLS; \ + _proto++) + +/** + * @brief Common types of data-path objects + * New types can be dynamically added using dpo_register_new_type() + */ +typedef enum dpo_type_t_ { + /** + * A non-zero value first so we can spot unitialisation errors + */ + DPO_FIRST, + DPO_DROP, + DPO_IP_NULL, + DPO_PUNT, + /** + * @brief load-balancing over a choice of [un]equal cost paths + */ + DPO_LOAD_BALANCE, + DPO_ADJACENCY, + DPO_ADJACENCY_INCOMPLETE, + DPO_ADJACENCY_MIDCHAIN, + DPO_ADJACENCY_GLEAN, + DPO_RECEIVE, + DPO_LOOKUP, + DPO_LISP_CP, + DPO_CLASSIFY, + DPO_MPLS_LABEL, + DPO_LAST, +} __attribute__((packed)) dpo_type_t; + +#define DPO_TYPE_NUM DPO_LAST + +#define DPO_TYPES { \ + [DPO_FIRST] = "dpo-invalid", \ + [DPO_DROP] = "dpo-drop", \ + [DPO_IP_NULL] = "dpo-ip-null", \ + [DPO_PUNT] = "dpo-punt", \ + [DPO_ADJACENCY] = "dpo-adjacency", \ + [DPO_ADJACENCY_INCOMPLETE] = "dpo-adjacency-incomplete", \ + [DPO_ADJACENCY_MIDCHAIN] = "dpo-adjacency-midcahin", \ + [DPO_ADJACENCY_GLEAN] = "dpo-glean", \ + [DPO_RECEIVE] = "dpo-receive", \ + [DPO_LOOKUP] = "dpo-lookup", \ + [DPO_LOAD_BALANCE] = "dpo-load-balance", \ + [DPO_LISP_CP] = "dpo-lisp-cp", \ + [DPO_CLASSIFY] = "dpo-classify", \ + [DPO_MPLS_LABEL] = "dpo-mpls-label" \ +} + +/** + * @brief The identity of a DPO is a combination of its type and its + * instance number/index of objects of that type + */ +typedef struct 
dpo_id_t_ { + /** + * the type + */ + dpo_type_t dpoi_type; + /** + * the data-path protocol of the type. + */ + dpo_proto_t dpoi_proto; + /** + * The next VLIB node to follow. + */ + u16 dpoi_next_node; + /** + * the index of objects of that type + */ + index_t dpoi_index; +} __attribute__ ((aligned(sizeof(u64)))) dpo_id_t; + +STATIC_ASSERT(sizeof(dpo_id_t) <= sizeof(u64), + "DPO ID is greater than sizeof u64 " + "atomic updates need to be revisited"); + +/** + * @brief An initialiser for DPOs declared on the stack. + * Thenext node is set to 0 since VLIB graph nodes should set 0 index to drop. + */ +#define DPO_INVALID \ +{ \ + .dpoi_type = DPO_FIRST, \ + .dpoi_proto = DPO_PROTO_NONE, \ + .dpoi_index = INDEX_INVALID, \ + .dpoi_next_node = 0, \ +} + +/** + * @brief Return true if the DPO object is valid, i.e. has been initialised. + */ +static inline int +dpo_id_is_valid (const dpo_id_t *dpoi) +{ + return (dpoi->dpoi_type != DPO_FIRST && + dpoi->dpoi_index != INDEX_INVALID); +} + +extern dpo_proto_t vnet_link_to_dpo_proto(vnet_link_t linkt); + +/** + * @brief + * Take a reference counting lock on the DPO + */ +extern void dpo_lock(dpo_id_t *dpo); + +/** + * @brief + * Release a reference counting lock on the DPO + */ +extern void dpo_unlock(dpo_id_t *dpo); + +/** + * @brief Set/create a DPO ID + * The DPO will be locked. + * + * @param dpo + * The DPO object to configure + * + * @param type + * The dpo_type_t of the DPO + * + * @param proto + * The dpo_proto_t of the DPO + * + * @param index + * The type specific index of the DPO + */ +extern void dpo_set(dpo_id_t *dpo, + dpo_type_t type, + dpo_proto_t proto, + index_t index); + +/** + * @brief reset a DPO ID + * The DPO will be unlocked. + * + * @param dpo + * The DPO object to reset + */ +extern void dpo_reset(dpo_id_t *dpo); + +/** + * @brief compare two DPOs for equality + */ +extern int dpo_cmp(const dpo_id_t *dpo1, + const dpo_id_t *dpo2); + +/** + * @brief + * atomic copy a data-plane object. + * This is safe to use when the dst DPO is currently switching packets + */ +extern void dpo_copy(dpo_id_t *dst, + const dpo_id_t *src); + +/** + * @brief Return TRUE is the DPO is any type of adjacency + */ +extern int dpo_is_adj(const dpo_id_t *dpo); + +/** + * @biref Format a DPO_id_t oject + */ +extern u8 *format_dpo_id(u8 * s, va_list * args); + +/** + * @biref format a DPO type + */ +extern u8 *format_dpo_type(u8 * s, va_list * args); + +/** + * @brief format a DPO protocol + */ +extern u8 *format_dpo_proto(u8 * s, va_list * args); + +/** + * @brief + * Set and stack a DPO. + * The DPO passed is set to the parent DPO and the necessary + * VLIB graph arcs are created. The child_type and child_proto + * are used to get the VLID nodes from which the arcs are added. + * + * @param child_type + * Child DPO type. + * + * @param child_proto + * Child DPO proto + * + * @parem dpo + * This is the DPO to stack and set. + * + * @paren parent_dpo + * The parent DPO to stack onto. + */ +extern void dpo_stack(dpo_type_t child_type, + dpo_proto_t child_proto, + dpo_id_t *dpo, + const dpo_id_t *parent_dpo); + +/** + * @brief + * Set and stack a DPO. + * The DPO passed is set to the parent DPO and the necessary + * VLIB graph arcs are created, from the child_node passed. + * + * @param child_node + * The VLIB grpah node index to create an arc from to the parent + * + * @parem dpo + * This is the DPO to stack and set. + * + * @paren parent_dpo + * The parent DPO to stack onto. 
+ */ +extern void dpo_stack_from_node(u32 child_node, + dpo_id_t *dpo, + const dpo_id_t *parent); + +/** + * @brief A lock function registered for a DPO type + */ +typedef void (*dpo_lock_fn_t)(dpo_id_t *dpo); + +/** + * @brief An unlock function registered for a DPO type + */ +typedef void (*dpo_unlock_fn_t)(dpo_id_t *dpo); + +/** + * @brief An memory usage show command + */ +typedef void (*dpo_mem_show_t)(void); + +/** + * @brief A virtual function table regisitered for a DPO type + */ +typedef struct dpo_vft_t_ +{ + /** + * A reference counting lock function + */ + dpo_lock_fn_t dv_lock; + /** + * A reference counting unlock function + */ + dpo_lock_fn_t dv_unlock; + /** + * A format function + */ + format_function_t *dv_format; + /** + * A show memory usage function + */ + dpo_mem_show_t dv_mem_show; +} dpo_vft_t; + + +/** + * @brief For a given DPO type Register: + * - a virtual function table + * - a NULL terminated array of graph nodes from which that object type + * will originate packets, i.e. the nodes in which the object type will be + * the parent DPO in the DP graph. The ndoes are per-data-path protocol + * (see above). + * + * @param type + * The type being registered. + * + * @param vft + * The virtual function table to register for the type. + * + * @param nodes + * The string description of the per-protocol VLIB graph nodes. + */ +extern void dpo_register(dpo_type_t type, + const dpo_vft_t *vft, + const char * const * const * nodes); + +/** + * @brief Create and register a new DPO type. + * + * This can be used by plugins to create new DPO types that are not listed + * in dpo_type_t enum + * + * @param vft + * The virtual function table to register for the type. + * + * @param nodes + * The string description of the per-protocol VLIB graph nodes. + * + * @return The new dpo_type_t + */ +extern dpo_type_t dpo_register_new_type(const dpo_vft_t *vft, + const char * const * const * nodes); + +#endif diff --git a/src/vnet/dpo/drop_dpo.c b/src/vnet/dpo/drop_dpo.c new file mode 100644 index 00000000000..5118d2a45b7 --- /dev/null +++ b/src/vnet/dpo/drop_dpo.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * The data-path object representing dropping the packet + */ + +#include <vnet/dpo/dpo.h> + +static dpo_id_t drop_dpos[DPO_PROTO_NUM]; + +const dpo_id_t * +drop_dpo_get (dpo_proto_t proto) +{ + dpo_set(&drop_dpos[proto], DPO_DROP, proto, proto); + + return (&drop_dpos[proto]); +} + +int +dpo_is_drop (const dpo_id_t *dpo) +{ + return (dpo->dpoi_type == DPO_DROP); +} + +static void +drop_dpo_lock (dpo_id_t *dpo) +{ + /* + * not maintaining a lock count on the drop + * more trouble than it's worth. + * There always needs to be one around. 
no point it managaing its lifetime + */ +} +static void +drop_dpo_unlock (dpo_id_t *dpo) +{ +} + +static u8* +format_drop_dpo (u8 *s, va_list *ap) +{ + CLIB_UNUSED(index_t index) = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + + return (format(s, "dpo-drop %U", format_dpo_proto, index)); +} + +const static dpo_vft_t drop_vft = { + .dv_lock = drop_dpo_lock, + .dv_unlock = drop_dpo_unlock, + .dv_format = format_drop_dpo, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a drop + * object. + * + * this means that these graph nodes are ones from which a drop is the + * parent object in the DPO-graph. + */ +const static char* const drop_ip4_nodes[] = +{ + "ip4-drop", + NULL, +}; +const static char* const drop_ip6_nodes[] = +{ + "ip6-drop", + NULL, +}; +const static char* const drop_mpls_nodes[] = +{ + "mpls-drop", + NULL, +}; +const static char* const drop_ethernet_nodes[] = +{ + "error-drop", + NULL, +}; +const static char* const * const drop_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = drop_ip4_nodes, + [DPO_PROTO_IP6] = drop_ip6_nodes, + [DPO_PROTO_MPLS] = drop_mpls_nodes, + [DPO_PROTO_ETHERNET] = drop_ethernet_nodes, +}; + +void +drop_dpo_module_init (void) +{ + dpo_register(DPO_DROP, &drop_vft, drop_nodes); +} diff --git a/src/vnet/dpo/drop_dpo.h b/src/vnet/dpo/drop_dpo.h new file mode 100644 index 00000000000..436df36c84e --- /dev/null +++ b/src/vnet/dpo/drop_dpo.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief The Drop DPO will drop all packets, no questions asked. It is valid + * for any packet protocol. + */ + +#ifndef __DROP_DPO_H__ +#define __DROP_DPO_H__ + +#include <vnet/dpo/dpo.h> + +extern int dpo_is_drop(const dpo_id_t *dpo); + +extern const dpo_id_t *drop_dpo_get(dpo_proto_t proto); + +extern void drop_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/ip_null_dpo.c b/src/vnet/dpo/ip_null_dpo.c new file mode 100644 index 00000000000..22682e4eee4 --- /dev/null +++ b/src/vnet/dpo/ip_null_dpo.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * @brief + * The data-path object representing dropping the packet + */ + +#include <vnet/dpo/ip_null_dpo.h> +#include <vnet/ip/ip.h> + +/** + * @brief A representation of the IP_NULL DPO + */ +typedef struct ip_null_dpo_t_ +{ + /** + * @brief The action to take on a packet + */ + ip_null_dpo_action_t ind_action; + /** + * @brief The next VLIB node + */ + u32 ind_next_index; + /** + * rate limits + */ +} ip_null_dpo_t; + +/** + * @brief the IP_NULL dpos are shared by all routes, hence they are global. + * As the neame implies this is only for IP, hence 2. + */ +static ip_null_dpo_t ip_null_dpos[2 * IP_NULL_DPO_ACTION_NUM] = { + [0] = { + /* proto ip4, no action */ + .ind_action = IP_NULL_ACTION_NONE, + }, + [1] = { + /* proto ip4, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_UNREACH, + }, + [2] = { + /* proto ip4, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_PROHIBIT, + }, + [3] = { + /* proto ip6, no action */ + .ind_action = IP_NULL_ACTION_NONE, + }, + [4] = { + /* proto ip6, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_UNREACH, + }, + [5] = { + /* proto ip6, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_PROHIBIT, + }, +}; + +/** + * @brief Action strings + */ +const char *ip_null_action_strings[] = IP_NULL_ACTIONS; + +void +ip_null_dpo_add_and_lock (dpo_proto_t proto, + ip_null_dpo_action_t action, + dpo_id_t *dpo) +{ + int i; + + ASSERT((proto == DPO_PROTO_IP4) || + (proto == DPO_PROTO_IP6)); + ASSERT(action < IP_NULL_DPO_ACTION_NUM); + + i = (proto == DPO_PROTO_IP4 ? 0 : 1); + + dpo_set(dpo, DPO_IP_NULL, proto, (i*IP_NULL_DPO_ACTION_NUM) + action); +} + +always_inline const ip_null_dpo_t* +ip_null_dpo_get (index_t indi) +{ + return (&ip_null_dpos[indi]); +} + +static void +ip_null_dpo_lock (dpo_id_t *dpo) +{ + /* + * not maintaining a lock count on the ip_null, they are const global and + * never die. + */ +} +static void +ip_null_dpo_unlock (dpo_id_t *dpo) +{ +} + +static u8* +format_ip_null_dpo (u8 *s, va_list *ap) +{ + index_t index = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + const ip_null_dpo_t *ind; + dpo_proto_t proto; + + ind = ip_null_dpo_get(index); + proto = (index < IP_NULL_DPO_ACTION_NUM ? DPO_PROTO_IP4 : DPO_PROTO_IP6); + + return (format(s, "%U-null action:%s", + format_dpo_proto, proto, + ip_null_action_strings[ind->ind_action])); +} + +const static dpo_vft_t ip_null_vft = { + .dv_lock = ip_null_dpo_lock, + .dv_unlock = ip_null_dpo_unlock, + .dv_format = format_ip_null_dpo, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a ip_null + * object. + * + * this means that these graph nodes are ones from which a ip_null is the + * parent object in the DPO-graph. 
+ */ +const static char* const ip4_null_nodes[] = +{ + "ip4-null", + NULL, +}; +const static char* const ip6_null_nodes[] = +{ + "ip6-null", + NULL, +}; + +const static char* const * const ip_null_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = ip4_null_nodes, + [DPO_PROTO_IP6] = ip6_null_nodes, +}; + +typedef struct ip_null_dpo_trace_t_ +{ + index_t ind_index; +} ip_null_dpo_trace_t; + +/** + * @brief Exit nodes from a IP_NULL + */ +typedef enum ip_null_next_t_ +{ + IP_NULL_NEXT_DROP, + IP_NULL_NEXT_ICMP, + IP_NULL_NEXT_NUM, +} ip_null_next_t; + +always_inline uword +ip_null_dpo_switch (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + u8 is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + static f64 time_last_seed_change = -1e100; + static u32 hash_seeds[3]; + static uword hash_bitmap[256 / BITS (uword)]; + f64 time_now; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + time_now = vlib_time_now (vm); + if (time_now - time_last_seed_change > 1e-1) + { + uword i; + u32 * r = clib_random_buffer_get_data (&vm->random_buffer, + sizeof (hash_seeds)); + for (i = 0; i < ARRAY_LEN (hash_seeds); i++) + hash_seeds[i] = r[i]; + + /* Mark all hash keys as been not-seen before. */ + for (i = 0; i < ARRAY_LEN (hash_bitmap); i++) + hash_bitmap[i] = 0; + + time_last_seed_change = time_now; + } + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 a0, b0, c0, m0, drop0; + vlib_buffer_t *p0; + u32 bi0, indi0, next0; + const ip_null_dpo_t *ind0; + uword bm0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + + /* lookup dst + src mac */ + indi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + ind0 = ip_null_dpo_get(indi0); + next0 = IP_NULL_NEXT_DROP; + + /* + * rate limit - don't DoS the sender. + */ + a0 = hash_seeds[0]; + b0 = hash_seeds[1]; + c0 = hash_seeds[2]; + + if (is_ip4) + { + ip4_header_t *ip0 = vlib_buffer_get_current (p0); + + a0 ^= ip0->dst_address.data_u32; + b0 ^= ip0->src_address.data_u32; + + hash_v3_finalize32 (a0, b0, c0); + } + else + { + ip6_header_t *ip0 = vlib_buffer_get_current (p0); + + a0 ^= ip0->dst_address.as_u32[0]; + b0 ^= ip0->src_address.as_u32[0]; + c0 ^= ip0->src_address.as_u32[1]; + + hash_v3_mix32 (a0, b0, c0); + + a0 ^= ip0->dst_address.as_u32[1]; + b0 ^= ip0->src_address.as_u32[2]; + c0 ^= ip0->src_address.as_u32[3]; + + hash_v3_finalize32 (a0, b0, c0); + } + + c0 &= BITS (hash_bitmap) - 1; + c0 = c0 / BITS (uword); + m0 = (uword) 1 << (c0 % BITS (uword)); + + bm0 = hash_bitmap[c0]; + drop0 = (bm0 & m0) != 0; + + /* Mark it as seen. */ + hash_bitmap[c0] = bm0 | m0; + + if (PREDICT_FALSE(!drop0)) + { + if (is_ip4) + { + /* + * There's a trade-off here. This conditinal statement + * versus a graph node per-condition. 
Given the number + * expect number of packets to reach a null route is 0 + * we favour the run-time cost over the graph complexity + */ + if (IP_NULL_ACTION_SEND_ICMP_UNREACH == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp4_error_set_vnet_buffer( + p0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_destination_unreachable_host, + 0); + } + else if (IP_NULL_ACTION_SEND_ICMP_PROHIBIT == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp4_error_set_vnet_buffer( + p0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_host_administratively_prohibited, + 0); + } + } + else + { + if (IP_NULL_ACTION_SEND_ICMP_UNREACH == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp6_error_set_vnet_buffer( + p0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_no_route_to_destination, + 0); + } + else if (IP_NULL_ACTION_SEND_ICMP_PROHIBIT == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp6_error_set_vnet_buffer( + p0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_destination_administratively_prohibited, + 0); + } + } + } + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + ip_null_dpo_trace_t *tr = vlib_add_trace (vm, node, p0, + sizeof (*tr)); + tr->ind_index = indi0; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static u8 * +format_ip_null_dpo_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip_null_dpo_trace_t *t = va_arg (*args, ip_null_dpo_trace_t *); + + s = format (s, "%U", format_ip_null_dpo, t->ind_index, 0); + return s; +} + +static uword +ip4_null_dpo_switch (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (ip_null_dpo_switch(vm, node, frame, 1)); +} + +/** + * @brief + */ +VLIB_REGISTER_NODE (ip4_null_dpo_node) = { + .function = ip4_null_dpo_switch, + .name = "ip4-null", + .vector_size = sizeof (u32), + + .format_trace = format_ip_null_dpo_trace, + .n_next_nodes = IP_NULL_NEXT_NUM, + .next_nodes = { + [IP_NULL_NEXT_DROP] = "ip4-drop", + [IP_NULL_NEXT_ICMP] = "ip4-icmp-error", + }, +}; + +static uword +ip6_null_dpo_switch (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (ip_null_dpo_switch(vm, node, frame, 0)); +} + +/** + * @brief + */ +VLIB_REGISTER_NODE (ip6_null_dpo_node) = { + .function = ip6_null_dpo_switch, + .name = "ip6-null", + .vector_size = sizeof (u32), + + .format_trace = format_ip_null_dpo_trace, + .n_next_nodes = IP_NULL_NEXT_NUM, + .next_nodes = { + [IP_NULL_NEXT_DROP] = "ip6-drop", + [IP_NULL_NEXT_ICMP] = "ip6-icmp-error", + }, +}; + +void +ip_null_dpo_module_init (void) +{ + dpo_register(DPO_IP_NULL, &ip_null_vft, ip_null_nodes); +} diff --git a/src/vnet/dpo/ip_null_dpo.h b/src/vnet/dpo/ip_null_dpo.h new file mode 100644 index 00000000000..002a2a7016d --- /dev/null +++ b/src/vnet/dpo/ip_null_dpo.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * The IP NULL DPO represents the rubbish bin for IP traffic. Without specifying an + * action (i.e. send IMCP type X to sender) it is equivalent to using a drop DPO. + * However, in contrast to the drop DPO any route that resovles via a NULL, is + * considered to 'resolved' by FIB, i.e. a IP NULL is used when the control plane + * is explicitly expressing the desire to drop packets. Drop DPOs are used + * internally by FIB when resolution is not possible. + * + * Any replies to sender are rate limited. + */ + +#ifndef __IP_NULL_DPO_H__ +#define __IP_NULL_DPO_H__ + +#include <vnet/dpo/dpo.h> + +/** + * @brief Actions to take when a packet encounters the NULL DPO + */ +typedef enum ip_null_dpo_action_t_ +{ + IP_NULL_ACTION_NONE, + IP_NULL_ACTION_SEND_ICMP_UNREACH, + IP_NULL_ACTION_SEND_ICMP_PROHIBIT, +} ip_null_dpo_action_t; + +#define IP_NULL_ACTIONS { \ + [IP_NULL_ACTION_NONE] = "discard", \ + [IP_NULL_ACTION_SEND_ICMP_UNREACH] = "send-unreachable", \ + [IP_NULL_ACTION_SEND_ICMP_PROHIBIT] = "send-prohibited", \ +} + +#define IP_NULL_DPO_ACTION_NUM (IP_NULL_ACTION_SEND_ICMP_PROHIBIT+1) + +extern void ip_null_dpo_add_and_lock (dpo_proto_t proto, + ip_null_dpo_action_t action, + dpo_id_t *dpo); + +extern void ip_null_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/load_balance.c b/src/vnet/dpo/load_balance.c new file mode 100644 index 00000000000..a244776ffb8 --- /dev/null +++ b/src/vnet/dpo/load_balance.c @@ -0,0 +1,993 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/ip/lookup.h> +#include <vnet/dpo/load_balance.h> +#include <vnet/dpo/load_balance_map.h> +#include <vnet/dpo/drop_dpo.h> +#include <vppinfra/math.h> /* for fabs */ +#include <vnet/adj/adj.h> +#include <vnet/adj/adj_internal.h> +#include <vnet/fib/fib_urpf_list.h> + +/* + * distribution error tolerance for load-balancing + */ +const f64 multipath_next_hop_error_tolerance = 0.1; + +#undef LB_DEBUG + +#ifdef LB_DEBUG +#define LB_DBG(_lb, _fmt, _args...) \ +{ \ + u8* _tmp =NULL; \ + clib_warning("lb:[%s]:" _fmt, \ + load_balance_format(load_balance_get_index((_lb)), \ + 0, _tmp), \ + ##_args); \ + vec_free(_tmp); \ +} +#else +#define LB_DBG(_p, _fmt, _args...) +#endif + + +/** + * Pool of all DPOs. 
It's not static so the DP can have fast access + */ +load_balance_t *load_balance_pool; + +/** + * The one instance of load-balance main + */ +load_balance_main_t load_balance_main; + +f64 +load_balance_get_multipath_tolerance (void) +{ + return (multipath_next_hop_error_tolerance); +} + +static inline index_t +load_balance_get_index (const load_balance_t *lb) +{ + return (lb - load_balance_pool); +} + +static inline dpo_id_t* +load_balance_get_buckets (load_balance_t *lb) +{ + if (LB_HAS_INLINE_BUCKETS(lb)) + { + return (lb->lb_buckets_inline); + } + else + { + return (lb->lb_buckets); + } +} + +static load_balance_t * +load_balance_alloc_i (void) +{ + load_balance_t *lb; + + pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES); + memset(lb, 0, sizeof(*lb)); + + lb->lb_map = INDEX_INVALID; + lb->lb_urpf = INDEX_INVALID; + vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters), + load_balance_get_index(lb)); + vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters), + load_balance_get_index(lb)); + vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters), + load_balance_get_index(lb)); + vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters), + load_balance_get_index(lb)); + + return (lb); +} + +static u8* +load_balance_format (index_t lbi, + load_balance_format_flags_t flags, + u32 indent, + u8 *s) +{ + vlib_counter_t to, via; + load_balance_t *lb; + dpo_id_t *buckets; + u32 i; + + lb = load_balance_get(lbi); + vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to); + vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via); + buckets = load_balance_get_buckets(lb); + + s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE); + s = format(s, "[index:%d buckets:%d ", lbi, lb->lb_n_buckets); + s = format(s, "uRPF:%d ", lb->lb_urpf); + s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes); + if (0 != via.packets) + { + s = format(s, " via:[%Ld:%Ld]", + via.packets, via.bytes); + } + s = format(s, "]"); + + if (INDEX_INVALID != lb->lb_map) + { + s = format(s, "\n%U%U", + format_white_space, indent+4, + format_load_balance_map, lb->lb_map, indent+4); + } + for (i = 0; i < lb->lb_n_buckets; i++) + { + s = format(s, "\n%U[%d] %U", + format_white_space, indent+2, + i, + format_dpo_id, + &buckets[i], indent+6); + } + return (s); +} + +u8* +format_load_balance (u8 * s, va_list * args) +{ + index_t lbi = va_arg(*args, index_t); + load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t); + + return (load_balance_format(lbi, flags, 0, s)); +} +static u8* +format_load_balance_dpo (u8 * s, va_list * args) +{ + index_t lbi = va_arg(*args, index_t); + u32 indent = va_arg(*args, u32); + + return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s)); +} + + +static load_balance_t * +load_balance_create_i (u32 num_buckets, + dpo_proto_t lb_proto, + flow_hash_config_t fhc) +{ + load_balance_t *lb; + + lb = load_balance_alloc_i(); + lb->lb_hash_config = fhc; + lb->lb_n_buckets = num_buckets; + lb->lb_n_buckets_minus_1 = num_buckets-1; + lb->lb_proto = lb_proto; + + if (!LB_HAS_INLINE_BUCKETS(lb)) + { + vec_validate_aligned(lb->lb_buckets, + lb->lb_n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + } + + LB_DBG(lb, "create"); + + return (lb); +} + +index_t +load_balance_create (u32 n_buckets, + dpo_proto_t lb_proto, + flow_hash_config_t fhc) +{ + return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc))); +} + +static inline void +load_balance_set_bucket_i (load_balance_t 
*lb, + u32 bucket, + dpo_id_t *buckets, + const dpo_id_t *next) +{ + dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next); +} + +void +load_balance_set_bucket (index_t lbi, + u32 bucket, + const dpo_id_t *next) +{ + load_balance_t *lb; + dpo_id_t *buckets; + + lb = load_balance_get(lbi); + buckets = load_balance_get_buckets(lb); + + ASSERT(bucket < lb->lb_n_buckets); + + load_balance_set_bucket_i(lb, bucket, buckets, next); +} + +int +load_balance_is_drop (const dpo_id_t *dpo) +{ + load_balance_t *lb; + + if (DPO_LOAD_BALANCE != dpo->dpoi_type) + return (0); + + lb = load_balance_get(dpo->dpoi_index); + + if (1 == lb->lb_n_buckets) + { + return (dpo_is_drop(load_balance_get_bucket_i(lb, 0))); + } + return (0); +} + +void +load_balance_set_urpf (index_t lbi, + index_t urpf) +{ + load_balance_t *lb; + index_t old; + + lb = load_balance_get(lbi); + + /* + * packets in flight we see this change. but it's atomic, so :P + */ + old = lb->lb_urpf; + lb->lb_urpf = urpf; + + fib_urpf_list_unlock(old); + fib_urpf_list_lock(urpf); +} + +index_t +load_balance_get_urpf (index_t lbi) +{ + load_balance_t *lb; + + lb = load_balance_get(lbi); + + return (lb->lb_urpf); +} + +const dpo_id_t * +load_balance_get_bucket (index_t lbi, + u32 bucket) +{ + load_balance_t *lb; + + lb = load_balance_get(lbi); + + return (load_balance_get_bucket_i(lb, bucket)); +} + +static int +next_hop_sort_by_weight (load_balance_path_t * n1, + load_balance_path_t * n2) +{ + return ((int) n1->path_weight - (int) n2->path_weight); +} + +/* Given next hop vector is over-written with normalized one with sorted weights and + with weights corresponding to the number of adjacencies for each next hop. + Returns number of adjacencies in block. */ +u32 +ip_multipath_normalize_next_hops (load_balance_path_t * raw_next_hops, + load_balance_path_t ** normalized_next_hops, + u32 *sum_weight_in, + f64 multipath_next_hop_error_tolerance) +{ + load_balance_path_t * nhs; + uword n_nhs, n_adj, n_adj_left, i, sum_weight; + f64 norm, error; + + n_nhs = vec_len (raw_next_hops); + ASSERT (n_nhs > 0); + if (n_nhs == 0) + return 0; + + /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */ + nhs = *normalized_next_hops; + vec_validate (nhs, 2*n_nhs - 1); + + /* Fast path: 1 next hop in block. */ + n_adj = n_nhs; + if (n_nhs == 1) + { + nhs[0] = raw_next_hops[0]; + nhs[0].path_weight = 1; + _vec_len (nhs) = 1; + sum_weight = 1; + goto done; + } + + else if (n_nhs == 2) + { + int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0; + + /* Fast sort. */ + nhs[0] = raw_next_hops[cmp]; + nhs[1] = raw_next_hops[cmp ^ 1]; + + /* Fast path: equal cost multipath with 2 next hops. */ + if (nhs[0].path_weight == nhs[1].path_weight) + { + nhs[0].path_weight = nhs[1].path_weight = 1; + _vec_len (nhs) = 2; + sum_weight = 2; + goto done; + } + } + else + { + clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0])); + qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight); + } + + /* Find total weight to normalize weights. */ + sum_weight = 0; + for (i = 0; i < n_nhs; i++) + sum_weight += nhs[i].path_weight; + + /* In the unlikely case that all weights are given as 0, set them all to 1. */ + if (sum_weight == 0) + { + for (i = 0; i < n_nhs; i++) + nhs[i].path_weight = 1; + sum_weight = n_nhs; + } + + /* Save copies of all next hop weights to avoid being overwritten in loop below. 
*/ + for (i = 0; i < n_nhs; i++) + nhs[n_nhs + i].path_weight = nhs[i].path_weight; + + /* Try larger and larger power of 2 sized adjacency blocks until we + find one where traffic flows to within 1% of specified weights. */ + for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2) + { + error = 0; + + norm = n_adj / ((f64) sum_weight); + n_adj_left = n_adj; + for (i = 0; i < n_nhs; i++) + { + f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */ + word n = flt_round_nearest (nf); + + n = n > n_adj_left ? n_adj_left : n; + n_adj_left -= n; + error += fabs (nf - n); + nhs[i].path_weight = n; + + if (0 == nhs[i].path_weight) + { + /* + * when the weight skew is high (norm is small) and n == nf. + * without this correction the path with a low weight would have + * no represenation in the load-balanace - don't want that. + * If the weight skew is high so the load-balance has many buckets + * to allow it. pays ya money takes ya choice. + */ + error = n_adj; + break; + } + } + + nhs[0].path_weight += n_adj_left; + + /* Less than 5% average error per adjacency with this size adjacency block? */ + if (error <= multipath_next_hop_error_tolerance*n_adj) + { + /* Truncate any next hops with zero weight. */ + _vec_len (nhs) = i; + break; + } + } + +done: + /* Save vector for next call. */ + *normalized_next_hops = nhs; + *sum_weight_in = sum_weight; + return n_adj; +} + +static load_balance_path_t * +load_balance_multipath_next_hop_fixup (load_balance_path_t *nhs, + dpo_proto_t drop_proto) +{ + if (0 == vec_len(nhs)) + { + load_balance_path_t *nh; + + /* + * we need something for the load-balance. so use the drop + */ + vec_add2(nhs, nh, 1); + + nh->path_weight = 1; + dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto)); + } + + return (nhs); +} + +/* + * Fill in adjacencies in block based on corresponding + * next hop adjacencies. + */ +static void +load_balance_fill_buckets (load_balance_t *lb, + load_balance_path_t *nhs, + dpo_id_t *buckets, + u32 n_buckets) +{ + load_balance_path_t * nh; + u16 ii, bucket; + + bucket = 0; + + /* + * the next-hops have normalised weights. that means their sum is the number + * of buckets we need to fill. + */ + vec_foreach (nh, nhs) + { + for (ii = 0; ii < nh->path_weight; ii++) + { + ASSERT(bucket < n_buckets); + load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo); + } + } +} + +static inline void +load_balance_set_n_buckets (load_balance_t *lb, + u32 n_buckets) +{ + lb->lb_n_buckets = n_buckets; + lb->lb_n_buckets_minus_1 = n_buckets-1; +} + +void +load_balance_multipath_update (const dpo_id_t *dpo, + load_balance_path_t * raw_next_hops, + load_balance_flags_t flags) +{ + u32 sum_of_weights,n_buckets, ii; + load_balance_path_t * nh, * nhs; + index_t lbmi, old_lbmi; + load_balance_t *lb; + dpo_id_t *tmp_dpo; + + nhs = NULL; + + ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type); + lb = load_balance_get(dpo->dpoi_index); + raw_next_hops = + load_balance_multipath_next_hop_fixup(raw_next_hops, + lb->lb_proto); + n_buckets = + ip_multipath_normalize_next_hops(raw_next_hops, + &nhs, + &sum_of_weights, + multipath_next_hop_error_tolerance); + + ASSERT (n_buckets >= vec_len (raw_next_hops)); + + /* + * Save the old load-balance map used, and get a new one if required. + */ + old_lbmi = lb->lb_map; + if (flags & LOAD_BALANCE_FLAG_USES_MAP) + { + lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs); + } + else + { + lbmi = INDEX_INVALID; + } + + if (0 == lb->lb_n_buckets) + { + /* + * first time initialisation. 
no packets inflight, so we can write + * at leisure. + */ + load_balance_set_n_buckets(lb, n_buckets); + + if (!LB_HAS_INLINE_BUCKETS(lb)) + vec_validate_aligned(lb->lb_buckets, + lb->lb_n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + + load_balance_fill_buckets(lb, nhs, + load_balance_get_buckets(lb), + n_buckets); + lb->lb_map = lbmi; + } + else + { + /* + * This is a modification of an existing load-balance. + * We need to ensure that packets inflight see a consistent state, that + * is the number of reported buckets the LB has (read from + * lb_n_buckets_minus_1) is not more than it actually has. So if the + * number of buckets is increasing, we must update the bucket array first, + * then the reported number. vice-versa if the number of buckets goes down. + */ + if (n_buckets == lb->lb_n_buckets) + { + /* + * no change in the number of buckets. we can simply fill what + * is new over what is old. + */ + load_balance_fill_buckets(lb, nhs, + load_balance_get_buckets(lb), + n_buckets); + lb->lb_map = lbmi; + } + else if (n_buckets > lb->lb_n_buckets) + { + /* + * we have more buckets. the old load-balance map (if there is one) + * will remain valid, i.e. mapping to indices within range, so we + * update it last. + */ + if (n_buckets > LB_NUM_INLINE_BUCKETS && + lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS) + { + /* + * the new increased number of buckets is crossing the threshold + * from the inline storage to out-line. Alloc the outline buckets + * first, then fixup the number. then reset the inlines. + */ + ASSERT(NULL == lb->lb_buckets); + vec_validate_aligned(lb->lb_buckets, + n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + + load_balance_fill_buckets(lb, nhs, + lb->lb_buckets, + n_buckets); + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + + CLIB_MEMORY_BARRIER(); + + for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++) + { + dpo_reset(&lb->lb_buckets_inline[ii]); + } + } + else + { + if (n_buckets <= LB_NUM_INLINE_BUCKETS) + { + /* + * we are not crossing the threshold and it's still inline buckets. + * we can write the new on the old.. + */ + load_balance_fill_buckets(lb, nhs, + load_balance_get_buckets(lb), + n_buckets); + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + } + else + { + /* + * we are not crossing the threshold. We need a new bucket array to + * hold the increased number of choices. + */ + dpo_id_t *new_buckets, *old_buckets, *tmp_dpo; + + new_buckets = NULL; + old_buckets = load_balance_get_buckets(lb); + + vec_validate_aligned(new_buckets, + n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + + load_balance_fill_buckets(lb, nhs, new_buckets, n_buckets); + CLIB_MEMORY_BARRIER(); + lb->lb_buckets = new_buckets; + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + + vec_foreach(tmp_dpo, old_buckets) + { + dpo_reset(tmp_dpo); + } + vec_free(old_buckets); + } + } + + /* + * buckets fixed. ready for the MAP update. + */ + lb->lb_map = lbmi; + } + else + { + /* + * bucket size shrinkage. + * Any map we have will be based on the old + * larger number of buckets, so will be translating to indices + * out of range. So the new MAP must be installed first. + */ + lb->lb_map = lbmi; + CLIB_MEMORY_BARRIER(); + + + if (n_buckets <= LB_NUM_INLINE_BUCKETS && + lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS) + { + /* + * the new decreased number of buckets is crossing the threshold + * from out-line storage to inline: + * 1 - Fill the inline buckets, + * 2 - fixup the number (and this point the inline buckets are + * used). 
+ * 3 - free the outline buckets + */ + load_balance_fill_buckets(lb, nhs, + lb->lb_buckets_inline, + n_buckets); + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + CLIB_MEMORY_BARRIER(); + + vec_foreach(tmp_dpo, lb->lb_buckets) + { + dpo_reset(tmp_dpo); + } + vec_free(lb->lb_buckets); + } + else + { + /* + * not crossing the threshold. + * 1 - update the number to the smaller size + * 2 - write the new buckets + * 3 - reset those no longer used. + */ + dpo_id_t *buckets; + u32 old_n_buckets; + + old_n_buckets = lb->lb_n_buckets; + buckets = load_balance_get_buckets(lb); + + load_balance_set_n_buckets(lb, n_buckets); + CLIB_MEMORY_BARRIER(); + + load_balance_fill_buckets(lb, nhs, + buckets, + n_buckets); + + for (ii = old_n_buckets-n_buckets; ii < old_n_buckets; ii++) + { + dpo_reset(&buckets[ii]); + } + } + } + } + + vec_foreach (nh, nhs) + { + dpo_reset(&nh->path_dpo); + } + vec_free(nhs); + + load_balance_map_unlock(old_lbmi); +} + +static void +load_balance_lock (dpo_id_t *dpo) +{ + load_balance_t *lb; + + lb = load_balance_get(dpo->dpoi_index); + + lb->lb_locks++; +} + +static void +load_balance_destroy (load_balance_t *lb) +{ + dpo_id_t *buckets; + int i; + + buckets = load_balance_get_buckets(lb); + + for (i = 0; i < lb->lb_n_buckets; i++) + { + dpo_reset(&buckets[i]); + } + + LB_DBG(lb, "destroy"); + if (!LB_HAS_INLINE_BUCKETS(lb)) + { + vec_free(lb->lb_buckets); + } + + fib_urpf_list_unlock(lb->lb_urpf); + load_balance_map_unlock(lb->lb_map); + + pool_put(load_balance_pool, lb); +} + +static void +load_balance_unlock (dpo_id_t *dpo) +{ + load_balance_t *lb; + + lb = load_balance_get(dpo->dpoi_index); + + lb->lb_locks--; + + if (0 == lb->lb_locks) + { + load_balance_destroy(lb); + } +} + +static void +load_balance_mem_show (void) +{ + fib_show_memory_usage("load-balance", + pool_elts(load_balance_pool), + pool_len(load_balance_pool), + sizeof(load_balance_t)); + load_balance_map_show_mem(); +} + +const static dpo_vft_t lb_vft = { + .dv_lock = load_balance_lock, + .dv_unlock = load_balance_unlock, + .dv_format = format_load_balance_dpo, + .dv_mem_show = load_balance_mem_show, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance + * object. + * + * this means that these graph nodes are ones from which a load-balance is the + * parent object in the DPO-graph. + * + * We do not list all the load-balance nodes, such as the *-lookup. instead + * we are relying on the correct use of the .sibling_of field when setting + * up these sibling nodes. 
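+ * (the lookup-* nodes registered in lookup_dpo.c, for example, use
+ * .sibling_of = "ip4-lookup" and friends rather than being listed in
+ * these per-protocol arrays)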
+ */ +const static char* const load_balance_ip4_nodes[] = +{ + "ip4-load-balance", + NULL, +}; +const static char* const load_balance_ip6_nodes[] = +{ + "ip6-load-balance", + NULL, +}; +const static char* const load_balance_mpls_nodes[] = +{ + "mpls-load-balance", + NULL, +}; +const static char* const load_balance_l2_nodes[] = +{ + "l2-load-balance", + NULL, +}; +const static char* const * const load_balance_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = load_balance_ip4_nodes, + [DPO_PROTO_IP6] = load_balance_ip6_nodes, + [DPO_PROTO_MPLS] = load_balance_mpls_nodes, + [DPO_PROTO_ETHERNET] = load_balance_l2_nodes, +}; + +void +load_balance_module_init (void) +{ + dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes); + + load_balance_map_module_init(); +} + +static clib_error_t * +load_balance_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + index_t lbi = INDEX_INVALID; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &lbi)) + ; + else + break; + } + + if (INDEX_INVALID != lbi) + { + vlib_cli_output (vm, "%U", format_load_balance, lbi, + LOAD_BALANCE_FORMAT_DETAIL); + } + else + { + load_balance_t *lb; + + pool_foreach(lb, load_balance_pool, + ({ + vlib_cli_output (vm, "%U", format_load_balance, + load_balance_get_index(lb), + LOAD_BALANCE_FORMAT_NONE); + })); + } + + return 0; +} + +VLIB_CLI_COMMAND (load_balance_show_command, static) = { + .path = "show load-balance", + .short_help = "show load-balance [<index>]", + .function = load_balance_show, +}; + + +always_inline u32 +ip_flow_hash (void *data) +{ + ip4_header_t *iph = (ip4_header_t *) data; + + if ((iph->ip_version_and_header_length & 0xF0) == 0x40) + return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT); + else + return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT); +} + +always_inline u64 +mac_to_u64 (u8 * m) +{ + return (*((u64 *) m) & 0xffffffffffff); +} + +always_inline u32 +l2_flow_hash (vlib_buffer_t * b0) +{ + ethernet_header_t *eh; + u64 a, b, c; + uword is_ip, eh_size; + u16 eh_type; + + eh = vlib_buffer_get_current (b0); + eh_type = clib_net_to_host_u16 (eh->type); + eh_size = ethernet_buffer_header_size (b0); + + is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6); + + /* since we have 2 cache lines, use them */ + if (is_ip) + a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size); + else + a = eh->type; + + b = mac_to_u64 ((u8 *) eh->dst_address); + c = mac_to_u64 ((u8 *) eh->src_address); + hash_mix64 (a, b, c); + + return (u32) c; +} + +typedef struct load_balance_trace_t_ +{ + index_t lb_index; +} load_balance_trace_t; + +static uword +l2_load_balance (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, next_index, *from, *to_next; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 bi0, lbi0, next0; + const dpo_id_t *dpo0; + const load_balance_t *lb0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* lookup dst + src mac */ + lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + lb0 = load_balance_get(lbi0); + + vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0); + + 
dpo0 = load_balance_get_bucket_i(lb0, + vnet_buffer(b0)->ip.flow_hash & + (lb0->lb_n_buckets_minus_1)); + + next0 = dpo0->dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + load_balance_trace_t *tr = vlib_add_trace (vm, node, b0, + sizeof (*tr)); + tr->lb_index = lbi0; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static u8 * +format_load_balance_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *); + + s = format (s, "L2-load-balance: index %d", t->lb_index); + return s; +} + +/** + * @brief + */ +VLIB_REGISTER_NODE (l2_load_balance_node) = { + .function = l2_load_balance, + .name = "l2-load-balance", + .vector_size = sizeof (u32), + + .format_trace = format_load_balance_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; diff --git a/src/vnet/dpo/load_balance.h b/src/vnet/dpo/load_balance.h new file mode 100644 index 00000000000..dc6485e688a --- /dev/null +++ b/src/vnet/dpo/load_balance.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * \brief + * The load-balance object represents an ECMP choice. The buckets of a load + * balance object point to the sub-graph after the choice is made. + * THe load-balance object is also object type returned from a FIB table lookup. + * As such it needs to represent the case where there is only one coice. It may + * seem like overkill to use a load-balance object in this case, but the reason + * is for performance. If the load-balance object were not the result of the FIB + * lookup, then some other object would be. The case where there was ECMP + * this other object would need a load-balance as a parent and hence just add + * an unnecessary indirection. + * + * It is also the object in the DP that represents a via-fib-entry in a recursive + * route. + * + */ + +#ifndef __LOAD_BALANCE_H__ +#define __LOAD_BALANCE_H__ + +#include <vlib/vlib.h> +#include <vnet/ip/lookup.h> +#include <vnet/dpo/dpo.h> +#include <vnet/fib/fib_types.h> + +/** + * Load-balance main + */ +typedef struct load_balance_main_t_ +{ + vlib_combined_counter_main_t lbm_to_counters; + vlib_combined_counter_main_t lbm_via_counters; +} load_balance_main_t; + +extern load_balance_main_t load_balance_main; + +/** + * The number of buckets that a load-balance object can have and still + * fit in one cache-line + */ +#define LB_NUM_INLINE_BUCKETS 4 + +/** + * @brief One path from an [EU]CMP set that the client wants to add to a + * load-balance object + */ +typedef struct load_balance_path_t_ { + /** + * ID of the Data-path object. 
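+     * This is the DPO that load_balance_fill_buckets() installs into the
+     * bucket(s) assigned to this path.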
+ */ + dpo_id_t path_dpo; + + /** + * The index of the FIB path + */ + fib_node_index_t path_index; + + /** + * weight for the path. + */ + u32 path_weight; +} load_balance_path_t; + +/** + * The FIB DPO provieds; + * - load-balancing over the next DPOs in the chain/graph + * - per-route counters + */ +typedef struct load_balance_t_ { + /** + * number of buckets in the load-balance. always a power of 2. + */ + u16 lb_n_buckets; + /** + * number of buckets in the load-balance - 1. used in the switch path + * as part of the hash calculation. + */ + u16 lb_n_buckets_minus_1; + + /** + * The protocol of packets that traverse this LB. + * need in combination with the flow hash config to determine how to hash. + * u8. + */ + dpo_proto_t lb_proto; + + /** + * The number of locks, which is approximately the number of users, + * of this load-balance. + * Load-balance objects of via-entries are heavily shared by recursives, + * so the lock count is a u32. + */ + u32 lb_locks; + + /** + * index of the load-balance map, INVALID if this LB does not use one + */ + index_t lb_map; + + /** + * This is the index of the uRPF list for this LB + */ + index_t lb_urpf; + + /** + * the hash config to use when selecting a bucket. this is a u16 + */ + flow_hash_config_t lb_hash_config; + + /** + * Vector of buckets containing the next DPOs, sized as lbo_num + */ + dpo_id_t *lb_buckets; + + /** + * The rest of the cache line is used for buckets. In the common case + * where there there are less than 4 buckets, then the buckets are + * on the same cachlie and we save ourselves a pointer dereferance in + * the data-path. + */ + dpo_id_t lb_buckets_inline[LB_NUM_INLINE_BUCKETS]; +} load_balance_t; + +STATIC_ASSERT(sizeof(load_balance_t) <= CLIB_CACHE_LINE_BYTES, + "A load_balance object size exceeds one cachline"); + +/** + * Flags controlling load-balance formatting/display + */ +typedef enum load_balance_format_flags_t_ { + LOAD_BALANCE_FORMAT_NONE, + LOAD_BALANCE_FORMAT_DETAIL = (1 << 0), +} load_balance_format_flags_t; + +/** + * Flags controlling load-balance creation and modification + */ +typedef enum load_balance_flags_t_ { + LOAD_BALANCE_FLAG_NONE = 0, + LOAD_BALANCE_FLAG_USES_MAP = (1 << 0), +} load_balance_flags_t; + +extern index_t load_balance_create(u32 num_buckets, + dpo_proto_t lb_proto, + flow_hash_config_t fhc); +extern void load_balance_multipath_update( + const dpo_id_t *dpo, + load_balance_path_t * raw_next_hops, + load_balance_flags_t flags); + +extern void load_balance_set_bucket(index_t lbi, + u32 bucket, + const dpo_id_t *next); +extern void load_balance_set_urpf(index_t lbi, + index_t urpf); +extern index_t load_balance_get_urpf(index_t lbi); + +extern u8* format_load_balance(u8 * s, va_list * args); + +extern const dpo_id_t *load_balance_get_bucket(index_t lbi, + u32 bucket); +extern int load_balance_is_drop(const dpo_id_t *dpo); + +extern f64 load_balance_get_multipath_tolerance(void); + +/** + * The encapsulation breakages are for fast DP access + */ +extern load_balance_t *load_balance_pool; +static inline load_balance_t* +load_balance_get (index_t lbi) +{ + return (pool_elt_at_index(load_balance_pool, lbi)); +} + +#define LB_HAS_INLINE_BUCKETS(_lb) \ + ((_lb)->lb_n_buckets <= LB_NUM_INLINE_BUCKETS) + +static inline const dpo_id_t * +load_balance_get_bucket_i (const load_balance_t *lb, + u32 bucket) +{ + ASSERT(bucket < lb->lb_n_buckets); + + if (PREDICT_TRUE(LB_HAS_INLINE_BUCKETS(lb))) + { + return (&lb->lb_buckets_inline[bucket]); + } + else + { + return (&lb->lb_buckets[bucket]); + } 
+} + +extern void load_balance_module_init(void); + +#endif diff --git a/src/vnet/dpo/load_balance_map.c b/src/vnet/dpo/load_balance_map.c new file mode 100644 index 00000000000..70ce1bf7c39 --- /dev/null +++ b/src/vnet/dpo/load_balance_map.c @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + */ +#include <vnet/fib/fib_path.h> +#include <vnet/fib/fib_node_list.h> +#include <vnet/dpo/load_balance_map.h> +#include <vnet/dpo/load_balance.h> + +/** + * A hash-table of load-balance maps by path index. + * this provides the fast lookup of the LB map when a path goes down + */ +static uword *lb_maps_by_path_index; + +/** + * A hash-table of load-balance maps by set of paths. + * This provides the LB map sharing. + * LB maps do not necessarily use all the paths in the list, since + * the entry that is requesting the map, may not have an out-going + * label for each of the paths. + */ +static uword *load_balance_map_db; + +typedef enum load_balance_map_path_flags_t_ +{ + LOAD_BALANCE_MAP_PATH_UP = (1 << 0), + LOAD_BALANCE_MAP_PATH_USABLE = (1 << 1), +} __attribute__ ((packed)) load_balance_map_path_flags_t; + +typedef struct load_balance_map_path_t_ { + /** + * Index of the path + */ + fib_node_index_t lbmp_index; + + /** + * Sibling Index in the list of all maps with this path index + */ + fib_node_index_t lbmp_sibling; + + /** + * the normalised wegiht of the path + */ + u32 lbmp_weight; + + /** + * The sate of the path + */ + load_balance_map_path_flags_t lbmp_flags; +} load_balance_map_path_t; + +/** + * The global pool of LB maps + */ +load_balance_map_t *load_balance_map_pool; + +/* + * Debug macro + */ +#ifdef FIB_DEBUG +#define LOAD_BALANCE_MAP_DBG(_pl, _fmt, _args...) \ + { \ + clib_warning("lbm: FIXME" _fmt, \ + ##_args); \ + } +#else +#define LOAD_BALANCE_MAP_DBG(_pl, _fmt, _args...) 
+#endif + +static index_t +load_balance_map_get_index (load_balance_map_t *lbm) +{ + return (lbm - load_balance_map_pool); +} + +u8* +format_load_balance_map (u8 *s, va_list ap) +{ + index_t lbmi = va_arg(ap, index_t); + u32 indent = va_arg(ap, u32); + load_balance_map_t *lbm; + u32 n_buckets, ii; + + lbm = load_balance_map_get(lbmi); + n_buckets = vec_len(lbm->lbm_buckets); + + s = format(s, "load-balance-map: index:%d buckets:%d", lbmi, n_buckets); + s = format(s, "\n%U index:", format_white_space, indent+2); + for (ii = 0; ii < n_buckets; ii++) + { + s = format(s, "%5d", ii); + } + s = format(s, "\n%U map:", format_white_space, indent+2); + for (ii = 0; ii < n_buckets; ii++) + { + s = format(s, "%5d", lbm->lbm_buckets[ii]); + } + + return (s); +} + + +static uword +load_balance_map_hash (load_balance_map_t *lbm) +{ + u32 old_lbm_hash, new_lbm_hash, hash; + load_balance_map_path_t *lb_path; + + new_lbm_hash = old_lbm_hash = vec_len(lbm->lbm_paths); + + vec_foreach (lb_path, lbm->lbm_paths) + { + hash = lb_path->lbmp_index; + hash_mix32(hash, old_lbm_hash, new_lbm_hash); + } + + return (new_lbm_hash); +} + +always_inline uword +load_balance_map_db_hash_key_from_index (uword index) +{ + return 1 + 2*index; +} + +always_inline uword +load_balance_map_db_hash_key_is_index (uword key) +{ + return key & 1; +} + +always_inline uword +load_balance_map_db_hash_key_2_index (uword key) +{ + ASSERT (load_balance_map_db_hash_key_is_index (key)); + return key / 2; +} + +static load_balance_map_t* +load_balance_map_db_get_from_hash_key (uword key) +{ + load_balance_map_t *lbm; + + if (load_balance_map_db_hash_key_is_index (key)) + { + index_t lbm_index; + + lbm_index = load_balance_map_db_hash_key_2_index(key); + lbm = load_balance_map_get(lbm_index); + } + else + { + lbm = uword_to_pointer (key, load_balance_map_t *); + } + + return (lbm); +} + +static uword +load_balance_map_db_hash_key_sum (hash_t * h, + uword key) +{ + load_balance_map_t *lbm; + + lbm = load_balance_map_db_get_from_hash_key(key); + + return (load_balance_map_hash(lbm)); +} + +static uword +load_balance_map_db_hash_key_equal (hash_t * h, + uword key1, + uword key2) +{ + load_balance_map_t *lbm1, *lbm2; + + lbm1 = load_balance_map_db_get_from_hash_key(key1); + lbm2 = load_balance_map_db_get_from_hash_key(key2); + + return (load_balance_map_hash(lbm1) == + load_balance_map_hash(lbm2)); +} + +static index_t +load_balance_map_db_find (load_balance_map_t *lbm) +{ + uword *p; + + p = hash_get(load_balance_map_db, lbm); + + if (NULL != p) + { + return p[0]; + } + + return (FIB_NODE_INDEX_INVALID); +} + +static void +load_balance_map_db_insert (load_balance_map_t *lbm) +{ + load_balance_map_path_t *lbmp; + fib_node_list_t list; + uword *p; + + ASSERT(FIB_NODE_INDEX_INVALID == load_balance_map_db_find(lbm)); + + /* + * insert into the DB based on the set of paths. + */ + hash_set (load_balance_map_db, + load_balance_map_db_hash_key_from_index( + load_balance_map_get_index(lbm)), + load_balance_map_get_index(lbm)); + + /* + * insert into each per-path list. 
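+     * (lb_maps_by_path_index) so that, when a path changes state, every
+     * map using that path can be found and re-striped quickly.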
+ */ + vec_foreach(lbmp, lbm->lbm_paths) + { + p = hash_get(lb_maps_by_path_index, lbmp->lbmp_index); + + if (NULL == p) + { + list = fib_node_list_create(); + hash_set(lb_maps_by_path_index, lbmp->lbmp_index, list); + } + else + { + list = p[0]; + } + + lbmp->lbmp_sibling = + fib_node_list_push_front(list, + 0, FIB_NODE_TYPE_FIRST, + load_balance_map_get_index(lbm)); + } + + LOAD_BALANCE_MAP_DBG(lbm, "DB-inserted"); +} + +static void +load_balance_map_db_remove (load_balance_map_t *lbm) +{ + load_balance_map_path_t *lbmp; + uword *p; + + ASSERT(FIB_NODE_INDEX_INVALID != load_balance_map_db_find(lbm)); + + hash_unset(load_balance_map_db, + load_balance_map_db_hash_key_from_index( + load_balance_map_get_index(lbm))); + + /* + * remove from each per-path list. + */ + vec_foreach(lbmp, lbm->lbm_paths) + { + p = hash_get(lb_maps_by_path_index, lbmp->lbmp_index); + + ASSERT(NULL != p); + + fib_node_list_remove(p[0], lbmp->lbmp_sibling); + } + + LOAD_BALANCE_MAP_DBG(lbm, "DB-removed"); +} + +/** + * @brief from the paths that are usable, fill the Map. + */ +static void +load_balance_map_fill (load_balance_map_t *lbm) +{ + load_balance_map_path_t *lbmp; + u32 n_buckets, bucket, ii, jj; + u16 *tmp_buckets; + + tmp_buckets = NULL; + n_buckets = vec_len(lbm->lbm_buckets); + + /* + * run throught the set of paths once, and build a vector of the + * indices that are usable. we do this is a scratch space, since we + * need to refer to it multiple times as we build the real buckets. + */ + vec_validate(tmp_buckets, n_buckets-1); + + bucket = jj = 0; + vec_foreach (lbmp, lbm->lbm_paths) + { + if (fib_path_is_resolved(lbmp->lbmp_index)) + { + for (ii = 0; ii < lbmp->lbmp_weight; ii++) + { + tmp_buckets[jj++] = bucket++; + } + } + else + { + bucket += lbmp->lbmp_weight; + } + } + _vec_len(tmp_buckets) = jj; + + /* + * If the number of temporaries written is as many as we need, implying + * all paths were up, then we can simply copy the scratch area over the + * actual buckets' memory + */ + if (jj == n_buckets) + { + memcpy(lbm->lbm_buckets, + tmp_buckets, + sizeof(lbm->lbm_buckets[0]) * n_buckets); + } + else + { + /* + * one or more paths are down. + */ + if (0 == vec_len(tmp_buckets)) + { + /* + * if the scratch area is empty, then no paths are usable. + * they will all drop. so use them all, lest we account drops + * against only one. + */ + for (bucket = 0; bucket < n_buckets; bucket++) + { + lbm->lbm_buckets[bucket] = bucket; + } + } + else + { + bucket = jj = 0; + vec_foreach (lbmp, lbm->lbm_paths) + { + if (fib_path_is_resolved(lbmp->lbmp_index)) + { + for (ii = 0; ii < lbmp->lbmp_weight; ii++) + { + lbm->lbm_buckets[bucket] = bucket; + bucket++; + } + } + else + { + /* + * path is unusable + * cycle through the scratch space selecting a index. + * this means we load balance, in the intended ratio, + * over the paths that are still usable. 
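+                     * e.g. 4 paths of weight 1 with only the 3rd down:
+                     * the scratch space holds {0,1,3} and the resulting
+                     * map is {0,1,0,3} - bucket 2's share is re-directed
+                     * to bucket 0.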
+ */ + for (ii = 0; ii < lbmp->lbmp_weight; ii++) + { + lbm->lbm_buckets[bucket] = tmp_buckets[jj]; + jj = (jj + 1) % vec_len(tmp_buckets); + bucket++; + } + } + } + } + } + + vec_free(tmp_buckets); +} + +static load_balance_map_t* +load_balance_map_alloc (const load_balance_path_t *paths) +{ + load_balance_map_t *lbm; + u32 ii; + + pool_get_aligned(load_balance_map_pool, lbm, CLIB_CACHE_LINE_BYTES); + memset(lbm, 0, sizeof(*lbm)); + + vec_validate(lbm->lbm_paths, vec_len(paths)-1); + + vec_foreach_index(ii, paths) + { + lbm->lbm_paths[ii].lbmp_index = paths[ii].path_index; + lbm->lbm_paths[ii].lbmp_weight = paths[ii].path_weight; + } + + return (lbm); +} + +static load_balance_map_t * +load_balance_map_init (load_balance_map_t *lbm, + u32 n_buckets, + u32 sum_of_weights) +{ + lbm->lbm_sum_of_norm_weights = sum_of_weights; + vec_validate(lbm->lbm_buckets, n_buckets-1); + + load_balance_map_db_insert(lbm); + + load_balance_map_fill(lbm); + + return (lbm); +} + +index_t +load_balance_map_add_or_lock (u32 n_buckets, + u32 sum_of_weights, + const load_balance_path_t *paths) +{ + load_balance_map_t *tmp, *lbm; + index_t lbmi; + + tmp = load_balance_map_alloc(paths); + + lbmi = load_balance_map_db_find(tmp); + + if (INDEX_INVALID == lbmi) + { + lbm = load_balance_map_init(tmp, n_buckets, sum_of_weights); + } + else + { + lbm = load_balance_map_get(lbmi); + } + + lbm->lbm_locks++; + + return (load_balance_map_get_index(lbm)); +} + +void +load_balance_map_lock (index_t lbmi) +{ + load_balance_map_t *lbm; + + lbm = load_balance_map_get(lbmi); + + lbm->lbm_locks++; +} + +void +load_balance_map_unlock (index_t lbmi) +{ + load_balance_map_t *lbm; + + if (INDEX_INVALID == lbmi) + { + return; + } + + lbm = load_balance_map_get(lbmi); + + lbm->lbm_locks--; + + if (0 == lbm->lbm_locks) + { + load_balance_map_db_remove(lbm); + vec_free(lbm->lbm_paths); + vec_free(lbm->lbm_buckets); + pool_put(load_balance_map_pool, lbm); + } +} + +static int +load_balance_map_path_state_change_walk (fib_node_ptr_t *fptr, + void *ctx) +{ + load_balance_map_t *lbm; + + lbm = load_balance_map_get(fptr->fnp_index); + + load_balance_map_fill(lbm); + + return (!0); +} + +/** + * @brief the state of a path has changed (it has no doubt gone down). + * This is the trigger to perform a PIC edge cutover and update the maps + * to exclude this path. 
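+ * The affected maps are re-filled in place via load_balance_map_fill(),
+ * so the load-balances referencing them pick up the new striping without
+ * themselves being modified.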
+ */ +void +load_balance_map_path_state_change (fib_node_index_t path_index) +{ + uword *p; + + /* + * re-stripe the buckets for each affect MAP + */ + p = hash_get(lb_maps_by_path_index, path_index); + + if (NULL == p) + return; + + fib_node_list_walk(p[0], load_balance_map_path_state_change_walk, NULL); +} + +/** + * @brief Make/add a new or lock an existing Load-balance map + */ +void +load_balance_map_module_init (void) +{ + load_balance_map_db = + hash_create2 (/* elts */ 0, + /* user */ 0, + /* value_bytes */ sizeof (index_t), + load_balance_map_db_hash_key_sum, + load_balance_map_db_hash_key_equal, + /* format pair/arg */ + 0, 0); + + lb_maps_by_path_index = hash_create(0, sizeof(fib_node_list_t)); +} + +void +load_balance_map_show_mem (void) +{ + fib_show_memory_usage("Load-Balance Map", + pool_elts(load_balance_map_pool), + pool_len(load_balance_map_pool), + sizeof(load_balance_map_t)); +} + +static clib_error_t * +load_balance_map_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + index_t lbmi = INDEX_INVALID; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &lbmi)) + ; + else + break; + } + + if (INDEX_INVALID != lbmi) + { + vlib_cli_output (vm, "%U", format_load_balance_map, lbmi, 0); + } + else + { + load_balance_map_t *lbm; + + pool_foreach(lbm, load_balance_map_pool, + ({ + vlib_cli_output (vm, "%U", format_load_balance_map, + load_balance_map_get_index(lbm), 0); + })); + } + + return 0; +} + +VLIB_CLI_COMMAND (load_balance_map_show_command, static) = { + .path = "show load-balance-map", + .short_help = "show load-balance-map [<index>]", + .function = load_balance_map_show, +}; diff --git a/src/vnet/dpo/load_balance_map.h b/src/vnet/dpo/load_balance_map.h new file mode 100644 index 00000000000..454bf4b3763 --- /dev/null +++ b/src/vnet/dpo/load_balance_map.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + */ + +#ifndef __LOAD_BALANCE_MAP_H__ +#define __LOAD_BALANCE_MAP_H__ + +#include <vlib/vlib.h> +#include <vnet/fib/fib_types.h> +#include <vnet/dpo/load_balance.h> + +struct load_balance_map_path_t_; + +/** + */ +typedef struct load_balance_map_t_ { + /** + * The buckets of the map that provide the index to index translation. + * In the first cacheline. + */ + u16 *lbm_buckets; + + /** + * the vector of paths this MAP represents + */ + struct load_balance_map_path_t_ *lbm_paths; + + /** + * the sum of the normalised weights. cache for convenience + */ + u32 lbm_sum_of_norm_weights; + + /** + * Number of locks. 
Maps are shared by a large number of recrusvie fib_entry_ts + */ + u32 lbm_locks; +} load_balance_map_t; + +extern index_t load_balance_map_add_or_lock(u32 n_buckets, + u32 sum_of_weights, + const load_balance_path_t *norm_paths); + +extern void load_balance_map_lock(index_t lmbi); +extern void load_balance_map_unlock(index_t lbmi); + +extern void load_balance_map_path_state_change(fib_node_index_t path_index); + +extern u8* format_load_balance_map(u8 *s, va_list ap); +extern void load_balance_map_show_mem(void); + +/** + * The encapsulation breakages are for fast DP access + */ +extern load_balance_map_t *load_balance_map_pool; + +static inline load_balance_map_t* +load_balance_map_get (index_t lbmi) +{ + return (pool_elt_at_index(load_balance_map_pool, lbmi)); +} + + +extern void load_balance_map_module_init(void); + +#endif diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c new file mode 100644 index 00000000000..96fedd27ce9 --- /dev/null +++ b/src/vnet/dpo/lookup_dpo.c @@ -0,0 +1,1185 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/ip/ip.h> +#include <vnet/dpo/lookup_dpo.h> +#include <vnet/dpo/load_balance.h> +#include <vnet/mpls/mpls.h> +#include <vnet/fib/fib_table.h> +#include <vnet/fib/ip4_fib.h> +#include <vnet/fib/ip6_fib.h> +#include <vnet/fib/mpls_fib.h> + +static const char *const lookup_input_names[] = LOOKUP_INPUTS; + +/** + * @brief Enumeration of the lookup subtypes + */ +typedef enum lookup_sub_type_t_ +{ + LOOKUP_SUB_TYPE_SRC, + LOOKUP_SUB_TYPE_DST, + LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE, +} lookup_sub_type_t; +#define LOOKUP_SUB_TYPE_NUM (LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE+1) + +#define FOR_EACH_LOOKUP_SUB_TYPE(_st) \ + for (_st = LOOKUP_SUB_TYPE_IP4_SRC; _st < LOOKUP_SUB_TYPE_NUM; _st++) + +/** + * @brief pool of all MPLS Label DPOs + */ +lookup_dpo_t *lookup_dpo_pool; + +/** + * @brief An array of registered DPO type values for the sub-types + */ +static dpo_type_t lookup_dpo_sub_types[LOOKUP_SUB_TYPE_NUM]; + +static lookup_dpo_t * +lookup_dpo_alloc (void) +{ + lookup_dpo_t *lkd; + + pool_get_aligned(lookup_dpo_pool, lkd, CLIB_CACHE_LINE_BYTES); + + return (lkd); +} + +static index_t +lookup_dpo_get_index (lookup_dpo_t *lkd) +{ + return (lkd - lookup_dpo_pool); +} + +static void +lookup_dpo_add_or_lock_i (fib_node_index_t fib_index, + dpo_proto_t proto, + lookup_input_t input, + lookup_table_t table_config, + dpo_id_t *dpo) +{ + lookup_dpo_t *lkd; + dpo_type_t type; + + lkd = lookup_dpo_alloc(); + lkd->lkd_fib_index = fib_index; + lkd->lkd_proto = proto; + lkd->lkd_input = input; + lkd->lkd_table = table_config; + + /* + * use the input type to select the lookup sub-type + */ + type = 0; + + switch (input) + { + case LOOKUP_INPUT_SRC_ADDR: + type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_SRC]; + break; + case LOOKUP_INPUT_DST_ADDR: + switch (table_config) + { + case LOOKUP_TABLE_FROM_INPUT_INTERFACE: + type = 
lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE]; + break; + case LOOKUP_TABLE_FROM_CONFIG: + type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST]; + break; + } + } + + if (0 == type) + { + dpo_reset(dpo); + } + else + { + dpo_set(dpo, type, proto, lookup_dpo_get_index(lkd)); + } +} + +void +lookup_dpo_add_or_lock_w_fib_index (fib_node_index_t fib_index, + dpo_proto_t proto, + lookup_input_t input, + lookup_table_t table_config, + dpo_id_t *dpo) +{ + if (LOOKUP_TABLE_FROM_CONFIG == table_config) + { + fib_table_lock(fib_index, dpo_proto_to_fib(proto)); + } + lookup_dpo_add_or_lock_i(fib_index, proto, input, table_config, dpo); +} + +void +lookup_dpo_add_or_lock_w_table_id (u32 table_id, + dpo_proto_t proto, + lookup_input_t input, + lookup_table_t table_config, + dpo_id_t *dpo) +{ + fib_node_index_t fib_index = FIB_NODE_INDEX_INVALID; + + if (LOOKUP_TABLE_FROM_CONFIG == table_config) + { + fib_index = + fib_table_find_or_create_and_lock(dpo_proto_to_fib(proto), + table_id); + } + + ASSERT(FIB_NODE_INDEX_INVALID != fib_index); + lookup_dpo_add_or_lock_i(fib_index, proto, input, table_config, dpo); +} + +u8* +format_lookup_dpo (u8 *s, va_list *args) +{ + index_t index = va_arg (*args, index_t); + lookup_dpo_t *lkd; + + lkd = lookup_dpo_get(index); + + if (LOOKUP_TABLE_FROM_INPUT_INTERFACE == lkd->lkd_table) + { + s = format(s, "%s lookup in interface's %U table", + lookup_input_names[lkd->lkd_input], + format_dpo_proto, lkd->lkd_proto); + } + else + { + s = format(s, "%s lookup in %U", + lookup_input_names[lkd->lkd_input], + format_fib_table_name, lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } + return (s); +} + +static void +lookup_dpo_lock (dpo_id_t *dpo) +{ + lookup_dpo_t *lkd; + + lkd = lookup_dpo_get(dpo->dpoi_index); + + lkd->lkd_locks++; +} + +static void +lookup_dpo_unlock (dpo_id_t *dpo) +{ + lookup_dpo_t *lkd; + + lkd = lookup_dpo_get(dpo->dpoi_index); + + lkd->lkd_locks--; + + if (0 == lkd->lkd_locks) + { + if (LOOKUP_TABLE_FROM_CONFIG == lkd->lkd_table) + { + fib_table_unlock(lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } + pool_put(lookup_dpo_pool, lkd); + } +} + +always_inline void +ip4_src_fib_lookup_one (u32 src_fib_index0, + const ip4_address_t * addr0, + u32 * src_adj_index0) +{ + ip4_fib_mtrie_leaf_t leaf0, leaf1; + ip4_fib_mtrie_t * mtrie0; + + mtrie0 = &ip4_fib_get (src_fib_index0)->mtrie; + + leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT; + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 0); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 1); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3); + + /* Handle default route. */ + leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? 
mtrie0->default_leaf : leaf0); + src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0); +} + +always_inline void +ip4_src_fib_lookup_two (u32 src_fib_index0, + u32 src_fib_index1, + const ip4_address_t * addr0, + const ip4_address_t * addr1, + u32 * src_adj_index0, + u32 * src_adj_index1) +{ + ip4_fib_mtrie_leaf_t leaf0, leaf1; + ip4_fib_mtrie_t * mtrie0, * mtrie1; + + mtrie0 = &ip4_fib_get (src_fib_index0)->mtrie; + mtrie1 = &ip4_fib_get (src_fib_index1)->mtrie; + + leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 0); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 0); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 1); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 1); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 2); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 3); + + /* Handle default route. */ + leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0); + leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1); + src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + src_adj_index1[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf1); +} + +/** + * @brief Lookup trace data + */ +typedef struct lookup_trace_t_ +{ + union { + ip46_address_t addr; + mpls_unicast_header_t hdr; + }; + fib_node_index_t fib_index; + index_t lbi; +} lookup_trace_t; + + +always_inline uword +lookup_dpo_ip4_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + int input_src_addr, + int table_from_interface) +{ + u32 n_left_from, next_index, * from, * to_next; + u32 cpu_index = os_get_cpu_number(); + vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next > 2) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + flow_hash_config_t flow_hash_config0; + const ip4_address_t *input_addr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip4_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + u32 bi1, lkdi1, lbi1, fib_index1, next1, hash_c1; + flow_hash_config_t flow_hash_config1; + const ip4_address_t *input_addr1; + const load_balance_t *lb1; + const lookup_dpo_t * lkd1; + const ip4_header_t * ip1; + const dpo_id_t *dpo1; + vlib_buffer_t * b1; + + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + bi0 = from[0]; + to_next[0] = bi0; + bi1 = from[1]; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + b1 = vlib_get_buffer (vm, bi1); + ip1 = vlib_buffer_get_current (b1); + + /* dst lookup was done by ip4 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkdi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + lkd1 = lookup_dpo_get(lkdi1); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip4_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + fib_index1 = + ip4_fib_table_get_index_for_sw_if_index( + vnet_buffer(b1)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + fib_index1 = lkd1->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr0 = &ip0->src_address; + input_addr1 = &ip1->src_address; + } + else + { + input_addr0 = &ip0->dst_address; + input_addr1 = &ip1->dst_address; + } + + /* do lookup */ + ip4_src_fib_lookup_two (fib_index0, fib_index1, + input_addr0, input_addr1, + &lbi0, &lbi1); + lb0 = load_balance_get(lbi0); + lb1 = load_balance_get(lbi1); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + vnet_buffer(b1)->sw_if_index[VLIB_TX] = fib_index1; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip4_compute_flow_hash (ip0, flow_hash_config0); + } + + if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) + { + flow_hash_config1 = lb1->lb_hash_config; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = + ip4_compute_flow_hash (ip1, flow_hash_config1); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + dpo1 = load_balance_get_bucket_i(lb1, + (hash_c1 & + (lb1->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + next1 = dpo1->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; + + vlib_increment_combined_counter + (cm, cpu_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + vlib_increment_combined_counter + (cm, cpu_index, lbi1, 1, + vlib_buffer_length_in_chain (vm, b1)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip4 = *input_addr0; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b1, sizeof (*tr)); + tr->fib_index = fib_index1; + tr->lbi = lbi1; + tr->addr.ip4 = *input_addr1; + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + flow_hash_config_t flow_hash_config0; + const ip4_address_t *input_addr; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip4_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + + /* dst lookup was done by ip4 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip4_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr = &ip0->src_address; + } + else + { + input_addr = &ip0->dst_address; + } + + /* do lookup */ + ip4_src_fib_lookup_one (fib_index0, input_addr, &lbi0); + lb0 = load_balance_get(lbi0); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip4_compute_flow_hash (ip0, flow_hash_config0); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + vlib_increment_combined_counter + (cm, cpu_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip4 = *input_addr; + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static u8 * +format_lookup_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + lookup_trace_t * t = va_arg (*args, lookup_trace_t *); + uword indent = format_get_indent (s); + s = format (s, "%U fib-index:%d addr:%U load-balance:%d", + format_white_space, indent, + t->fib_index, + format_ip46_address, &t->addr, IP46_TYPE_ANY, + t->lbi); + return s; +} + +always_inline uword +lookup_ip4_dst (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip4_inline(vm, node, from_frame, 0, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip4_dst_node) = { + .function = lookup_ip4_dst, + .name = "lookup-ip4-dst", + .vector_size = sizeof (u32), + .sibling_of = "ip4-lookup", + .format_trace = format_lookup_trace, +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_node, lookup_ip4_dst) + +always_inline uword +lookup_ip4_dst_itf (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip4_inline(vm, node, from_frame, 0, 1)); +} + +VLIB_REGISTER_NODE (lookup_ip4_dst_itf_node) = { + .function = lookup_ip4_dst_itf, + .name = "lookup-ip4-dst-itf", + .vector_size = sizeof (u32), + .sibling_of = "ip4-lookup", + .format_trace = format_lookup_trace, +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_itf_node, lookup_ip4_dst_itf) + +always_inline uword +lookup_ip4_src (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip4_inline(vm, node, from_frame, 1, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip4_src_node) = { + .function = lookup_ip4_src, + .name = "lookup-ip4-src", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip4-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_src_node, lookup_ip4_src) + +always_inline uword +lookup_dpo_ip6_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + int input_src_addr, + int table_from_interface) +{ + vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; + u32 n_left_from, next_index, * from, * to_next; + u32 cpu_index = os_get_cpu_number(); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next > 2) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + 
flow_hash_config_t flow_hash_config0; + const ip6_address_t *input_addr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip6_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + u32 bi1, lkdi1, lbi1, fib_index1, next1, hash_c1; + flow_hash_config_t flow_hash_config1; + const ip6_address_t *input_addr1; + const load_balance_t *lb1; + const lookup_dpo_t * lkd1; + const ip6_header_t * ip1; + const dpo_id_t *dpo1; + vlib_buffer_t * b1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + bi0 = from[0]; + to_next[0] = bi0; + bi1 = from[1]; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + b1 = vlib_get_buffer (vm, bi1); + ip1 = vlib_buffer_get_current (b1); + + /* dst lookup was done by ip6 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkdi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + lkd1 = lookup_dpo_get(lkdi1); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip6_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + fib_index1 = + ip6_fib_table_get_index_for_sw_if_index( + vnet_buffer(b1)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + fib_index1 = lkd1->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr0 = &ip0->src_address; + input_addr1 = &ip1->src_address; + } + else + { + input_addr0 = &ip0->dst_address; + input_addr1 = &ip1->dst_address; + } + + /* do src lookup */ + lbi0 = ip6_fib_table_fwding_lookup(&ip6_main, + fib_index0, + input_addr0); + lbi1 = ip6_fib_table_fwding_lookup(&ip6_main, + fib_index1, + input_addr1); + lb0 = load_balance_get(lbi0); + lb1 = load_balance_get(lbi1); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + vnet_buffer(b1)->sw_if_index[VLIB_TX] = fib_index1; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip6_compute_flow_hash (ip0, flow_hash_config0); + } + + if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) + { + flow_hash_config1 = lb1->lb_hash_config; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = + ip6_compute_flow_hash (ip1, flow_hash_config1); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + dpo1 = load_balance_get_bucket_i(lb1, + (hash_c1 & + (lb1->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + next1 = dpo1->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; + + vlib_increment_combined_counter + (cm, cpu_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + vlib_increment_combined_counter + (cm, cpu_index, lbi1, 1, + vlib_buffer_length_in_chain (vm, b1)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip6 = *input_addr0; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b1, sizeof (*tr)); + tr->fib_index = fib_index1; + tr->lbi = lbi1; + tr->addr.ip6 = *input_addr1; + } + vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, + n_left_to_next, bi0, bi1, + next0, next1); + } + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + flow_hash_config_t flow_hash_config0; + const ip6_address_t *input_addr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip6_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + + /* dst lookup was done by ip6 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip6_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr0 = &ip0->src_address; + } + else + { + input_addr0 = &ip0->dst_address; + } + + /* do src lookup */ + lbi0 = ip6_fib_table_fwding_lookup(&ip6_main, + fib_index0, + input_addr0); + lb0 = load_balance_get(lbi0); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip6_compute_flow_hash (ip0, flow_hash_config0); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + vlib_increment_combined_counter + (cm, cpu_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip6 = *input_addr0; + } + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +always_inline uword +lookup_ip6_dst (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip6_inline(vm, node, from_frame, 0 /*use src*/, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip6_dst_node) = { + .function = lookup_ip6_dst, + .name = "lookup-ip6-dst", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip6-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_dst_node, lookup_ip6_dst) + +always_inline uword +lookup_ip6_dst_itf (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip6_inline(vm, node, from_frame, 0 /*use src*/, 1)); +} + +VLIB_REGISTER_NODE (lookup_ip6_dst_itf_node) = { + .function = lookup_ip6_dst_itf, + .name = "lookup-ip6-dst-itf", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip6-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_dst_itf_node, lookup_ip6_dst_itf) + +always_inline uword +lookup_ip6_src (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip6_inline(vm, node, from_frame, 1, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip6_src_node) = { + .function = lookup_ip6_src, + .name = "lookup-ip6-src", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip6-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_src_node, lookup_ip6_src) + +always_inline uword +lookup_dpo_mpls_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + int table_from_interface) +{ + u32 n_left_from, next_index, * from, * to_next; + u32 cpu_index = os_get_cpu_number(); + vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + /* while (n_left_from >= 4 && n_left_to_next >= 2) */ + /* } */ + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0; + const mpls_unicast_header_t * hdr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + hdr0 = vlib_buffer_get_current (b0); + + /* dst lookup was done by mpls lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkd0 = 
lookup_dpo_get(lkdi0); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + mpls_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + } + + /* do lookup */ + lbi0 = mpls_fib_table_forwarding_lookup (fib_index0, hdr0); + lb0 = load_balance_get(lbi0); + dpo0 = load_balance_get_bucket_i(lb0, 0); + + next0 = dpo0->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + vlib_increment_combined_counter + (cm, cpu_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->hdr = *hdr0; + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static u8 * +format_lookup_mpls_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + lookup_trace_t * t = va_arg (*args, lookup_trace_t *); + uword indent = format_get_indent (s); + mpls_unicast_header_t hdr; + + hdr.label_exp_s_ttl = clib_net_to_host_u32(t->hdr.label_exp_s_ttl); + + s = format (s, "%U fib-index:%d hdr:%U load-balance:%d", + format_white_space, indent, + t->fib_index, + format_mpls_header, hdr, + t->lbi); + return s; +} + +always_inline uword +lookup_mpls_dst (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_mpls_inline(vm, node, from_frame, 0)); +} + +VLIB_REGISTER_NODE (lookup_mpls_dst_node) = { + .function = lookup_mpls_dst, + .name = "lookup-mpls-dst", + .vector_size = sizeof (u32), + .sibling_of = "mpls-lookup", + .format_trace = format_lookup_mpls_trace, + .n_next_nodes = 0, +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_mpls_dst_node, lookup_mpls_dst) + +always_inline uword +lookup_mpls_dst_itf (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_mpls_inline(vm, node, from_frame, 1)); +} + +VLIB_REGISTER_NODE (lookup_mpls_dst_itf_node) = { + .function = lookup_mpls_dst_itf, + .name = "lookup-mpls-dst-itf", + .vector_size = sizeof (u32), + .sibling_of = "mpls-lookup", + .format_trace = format_lookup_mpls_trace, + .n_next_nodes = 0, +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_mpls_dst_itf_node, lookup_mpls_dst_itf) + +static void +lookup_dpo_mem_show (void) +{ + fib_show_memory_usage("Lookup", + pool_elts(lookup_dpo_pool), + pool_len(lookup_dpo_pool), + sizeof(lookup_dpo_t)); +} + +const static dpo_vft_t lkd_vft = { + .dv_lock = lookup_dpo_lock, + .dv_unlock = lookup_dpo_unlock, + .dv_format = format_lookup_dpo, +}; +const static dpo_vft_t lkd_vft_w_mem_show = { + .dv_lock = lookup_dpo_lock, + .dv_unlock = lookup_dpo_unlock, + .dv_format = format_lookup_dpo, + .dv_mem_show = lookup_dpo_mem_show, +}; + +const static char* const lookup_src_ip4_nodes[] = +{ + "lookup-ip4-src", + NULL, +}; +const static char* const lookup_src_ip6_nodes[] = +{ + "lookup-ip6-src", + NULL, +}; +const static char* const * const lookup_src_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_src_ip4_nodes, + [DPO_PROTO_IP6] = lookup_src_ip6_nodes, + [DPO_PROTO_MPLS] = NULL, +}; + +const static char* const 
lookup_dst_ip4_nodes[] = +{ + "lookup-ip4-dst", + NULL, +}; +const static char* const lookup_dst_ip6_nodes[] = +{ + "lookup-ip6-dst", + NULL, +}; +const static char* const lookup_dst_mpls_nodes[] = +{ + "lookup-mpls-dst", + NULL, +}; +const static char* const * const lookup_dst_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_dst_ip4_nodes, + [DPO_PROTO_IP6] = lookup_dst_ip6_nodes, + [DPO_PROTO_MPLS] = lookup_dst_mpls_nodes, +}; + +const static char* const lookup_dst_from_interface_ip4_nodes[] = +{ + "lookup-ip4-dst-itf", + NULL, +}; +const static char* const lookup_dst_from_interface_ip6_nodes[] = +{ + "lookup-ip6-dst-itf", + NULL, +}; +const static char* const lookup_dst_from_interface_mpls_nodes[] = +{ + "lookup-mpls-dst-itf", + NULL, +}; +const static char* const * const lookup_dst_from_interface_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_dst_from_interface_ip4_nodes, + [DPO_PROTO_IP6] = lookup_dst_from_interface_ip6_nodes, + [DPO_PROTO_MPLS] = lookup_dst_from_interface_mpls_nodes, +}; + + +void +lookup_dpo_module_init (void) +{ + dpo_register(DPO_LOOKUP, &lkd_vft_w_mem_show, NULL); + + /* + * There are various sorts of lookup; src or dst addr v4 /v6 etc. + * there isn't an object type for each (there is only the lookup_dpo_t), + * but, for performance reasons, there is a data plane function, and hence + * VLIB node for each. VLIB graph node construction is based on DPO types + * so we create sub-types. + */ + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_SRC] = + dpo_register_new_type(&lkd_vft, lookup_src_nodes); + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST] = + dpo_register_new_type(&lkd_vft, lookup_dst_nodes); + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE] = + dpo_register_new_type(&lkd_vft, lookup_dst_from_interface_nodes); +} diff --git a/src/vnet/dpo/lookup_dpo.h b/src/vnet/dpo/lookup_dpo.h new file mode 100644 index 00000000000..ff283388868 --- /dev/null +++ b/src/vnet/dpo/lookup_dpo.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __LOOKUP_DPO_H__
+#define __LOOKUP_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * Switch to use the packet's source or destination address for the lookup
+ */
+typedef enum lookup_input_t_ {
+    LOOKUP_INPUT_SRC_ADDR,
+    LOOKUP_INPUT_DST_ADDR,
+} __attribute__ ((packed)) lookup_input_t;
+
+#define LOOKUP_INPUTS {                                 \
+    [LOOKUP_INPUT_SRC_ADDR] = "src-address",            \
+    [LOOKUP_INPUT_DST_ADDR] = "dst-address",            \
+}
+
+/**
+ * Switch to use the table index from the DPO's configuration or the table
+ * owned by the packet's input interface
+ */
+typedef enum lookup_table_t_ {
+    LOOKUP_TABLE_FROM_INPUT_INTERFACE,
+    LOOKUP_TABLE_FROM_CONFIG,
+} __attribute__ ((packed)) lookup_table_t;
+
+#define LOOKUP_TABLES {                                                 \
+    [LOOKUP_TABLE_FROM_INPUT_INTERFACE] = "table-input-interface",      \
+    [LOOKUP_TABLE_FROM_CONFIG] = "table-configured",                    \
+}
+
+/**
+ * The lookup DPO: an instruction to perform another lookup, in either a
+ * configured FIB table or the table of the packet's input interface
+ */
+typedef struct lookup_dpo_t
+{
+    /**
+     * The FIB, or interface from which to get a FIB, in which to perform
+     * the next lookup.
+     */
+    fib_node_index_t lkd_fib_index;
+
+    /**
+     * The protocol of the FIB for the lookup, and hence
+     * the protocol of the packet
+     */
+    dpo_proto_t lkd_proto;
+
+    /**
+     * Switch to use src or dst address
+     */
+    lookup_input_t lkd_input;
+
+    /**
+     * Switch to use the table index passed, or the table of the input interface
+     */
+    lookup_table_t lkd_table;
+
+    /**
+     * Number of locks
+     */
+    u16 lkd_locks;
+} lookup_dpo_t;
+
+extern void lookup_dpo_add_or_lock_w_fib_index(fib_node_index_t fib_index,
+                                               dpo_proto_t proto,
+                                               lookup_input_t input,
+                                               lookup_table_t table,
+                                               dpo_id_t *dpo);
+extern void lookup_dpo_add_or_lock_w_table_id(u32 table_id,
+                                              dpo_proto_t proto,
+                                              lookup_input_t input,
+                                              lookup_table_t table,
+                                              dpo_id_t *dpo);
+
+extern u8* format_lookup_dpo(u8 *s, va_list *args);
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern lookup_dpo_t *lookup_dpo_pool;
+
+static inline lookup_dpo_t *
+lookup_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(lookup_dpo_pool, index));
+}
+
+extern void lookup_dpo_module_init(void);
+
+#endif
diff --git a/src/vnet/dpo/mpls_label_dpo.c b/src/vnet/dpo/mpls_label_dpo.c
new file mode 100644
index 00000000000..bbdc9666503
--- /dev/null
+++ b/src/vnet/dpo/mpls_label_dpo.c
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include <vnet/ip/ip.h> +#include <vnet/dpo/mpls_label_dpo.h> +#include <vnet/mpls/mpls.h> + +/* + * pool of all MPLS Label DPOs + */ +mpls_label_dpo_t *mpls_label_dpo_pool; + +static mpls_label_dpo_t * +mpls_label_dpo_alloc (void) +{ + mpls_label_dpo_t *mld; + + pool_get_aligned(mpls_label_dpo_pool, mld, CLIB_CACHE_LINE_BYTES); + memset(mld, 0, sizeof(*mld)); + + dpo_reset(&mld->mld_dpo); + + return (mld); +} + +static index_t +mpls_label_dpo_get_index (mpls_label_dpo_t *mld) +{ + return (mld - mpls_label_dpo_pool); +} + +index_t +mpls_label_dpo_create (mpls_label_t *label_stack, + mpls_eos_bit_t eos, + u8 ttl, + u8 exp, + dpo_proto_t payload_proto, + const dpo_id_t *dpo) +{ + mpls_label_dpo_t *mld; + u32 ii; + + mld = mpls_label_dpo_alloc(); + mld->mld_n_labels = vec_len(label_stack); + mld->mld_n_hdr_bytes = mld->mld_n_labels * sizeof(mld->mld_hdr[0]); + mld->mld_payload_proto = payload_proto; + + /* + * construct label rewrite headers for each value value passed. + * get the header in network byte order since we will paint it + * on a packet in the data-plane + */ + + for (ii = 0; ii < mld->mld_n_labels-1; ii++) + { + vnet_mpls_uc_set_label(&mld->mld_hdr[ii].label_exp_s_ttl, label_stack[ii]); + vnet_mpls_uc_set_ttl(&mld->mld_hdr[ii].label_exp_s_ttl, 255); + vnet_mpls_uc_set_exp(&mld->mld_hdr[ii].label_exp_s_ttl, 0); + vnet_mpls_uc_set_s(&mld->mld_hdr[ii].label_exp_s_ttl, MPLS_NON_EOS); + mld->mld_hdr[ii].label_exp_s_ttl = + clib_host_to_net_u32(mld->mld_hdr[ii].label_exp_s_ttl); + } + + /* + * the inner most label + */ + ii = mld->mld_n_labels-1; + + vnet_mpls_uc_set_label(&mld->mld_hdr[ii].label_exp_s_ttl, label_stack[ii]); + vnet_mpls_uc_set_ttl(&mld->mld_hdr[ii].label_exp_s_ttl, ttl); + vnet_mpls_uc_set_exp(&mld->mld_hdr[ii].label_exp_s_ttl, exp); + vnet_mpls_uc_set_s(&mld->mld_hdr[ii].label_exp_s_ttl, eos); + mld->mld_hdr[ii].label_exp_s_ttl = + clib_host_to_net_u32(mld->mld_hdr[ii].label_exp_s_ttl); + + /* + * stack this label objct on its parent. + */ + dpo_stack(DPO_MPLS_LABEL, + mld->mld_payload_proto, + &mld->mld_dpo, + dpo); + + return (mpls_label_dpo_get_index(mld)); +} + +u8* +format_mpls_label_dpo (u8 *s, va_list *args) +{ + index_t index = va_arg (*args, index_t); + u32 indent = va_arg (*args, u32); + mpls_unicast_header_t hdr; + mpls_label_dpo_t *mld; + u32 ii; + + mld = mpls_label_dpo_get(index); + + s = format(s, "mpls-label:[%d]:", index); + + for (ii = 0; ii < mld->mld_n_labels; ii++) + { + hdr.label_exp_s_ttl = + clib_net_to_host_u32(mld->mld_hdr[ii].label_exp_s_ttl); + s = format(s, "%U", format_mpls_header, hdr); + } + + s = format(s, "\n%U", format_white_space, indent); + s = format(s, "%U", format_dpo_id, &mld->mld_dpo, indent+2); + + return (s); +} + +static void +mpls_label_dpo_lock (dpo_id_t *dpo) +{ + mpls_label_dpo_t *mld; + + mld = mpls_label_dpo_get(dpo->dpoi_index); + + mld->mld_locks++; +} + +static void +mpls_label_dpo_unlock (dpo_id_t *dpo) +{ + mpls_label_dpo_t *mld; + + mld = mpls_label_dpo_get(dpo->dpoi_index); + + mld->mld_locks--; + + if (0 == mld->mld_locks) + { + dpo_reset(&mld->mld_dpo); + pool_put(mpls_label_dpo_pool, mld); + } +} + +/** + * @brief A struct to hold tracing information for the MPLS label imposition + * node. 
+ */ +typedef struct mpls_label_imposition_trace_t_ +{ + /** + * The MPLS header imposed + */ + mpls_unicast_header_t hdr; +} mpls_label_imposition_trace_t; + +always_inline uword +mpls_label_imposition_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + u8 payload_is_ip4, + u8 payload_is_ip6) +{ + u32 n_left_from, next_index, * from, * to_next; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + mpls_unicast_header_t *hdr0, *hdr1; + mpls_label_dpo_t *mld0, *mld1; + u32 bi0, mldi0, bi1, mldi1; + vlib_buffer_t * b0, *b1; + u32 next0, next1; + u8 ttl0, ttl1; + + bi0 = to_next[0] = from[0]; + bi1 = to_next[1] = from[1]; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, STORE); + vlib_prefetch_buffer_header (p3, STORE); + + CLIB_PREFETCH (p2->data, sizeof (hdr0[0]), STORE); + CLIB_PREFETCH (p3->data, sizeof (hdr0[0]), STORE); + } + + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* dst lookup was done by ip4 lookup */ + mldi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + mldi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + mld0 = mpls_label_dpo_get(mldi0); + mld1 = mpls_label_dpo_get(mldi1); + + if (payload_is_ip4) + { + /* + * decrement the TTL on ingress to the LSP + */ + ip4_header_t * ip0 = vlib_buffer_get_current(b0); + ip4_header_t * ip1 = vlib_buffer_get_current(b1); + u32 checksum0; + u32 checksum1; + + checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100); + checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100); + + checksum0 += checksum0 >= 0xffff; + checksum1 += checksum1 >= 0xffff; + + ip0->checksum = checksum0; + ip1->checksum = checksum1; + + ip0->ttl -= 1; + ip1->ttl -= 1; + + ttl1 = ip1->ttl; + ttl0 = ip0->ttl; + } + else if (payload_is_ip6) + { + /* + * decrement the TTL on ingress to the LSP + */ + ip6_header_t * ip0 = vlib_buffer_get_current(b0); + ip6_header_t * ip1 = vlib_buffer_get_current(b1); + + + ip0->hop_limit -= 1; + ip1->hop_limit -= 1; + + ttl0 = ip0->hop_limit; + ttl1 = ip1->hop_limit; + } + else + { + /* + * else, the packet to be encapped is an MPLS packet + */ + if (PREDICT_TRUE(vnet_buffer(b0)->mpls.first)) + { + /* + * The first label to be imposed on the packet. this is a label swap. + * in which case we stashed the TTL and EXP bits in the + * packet in the lookup node + */ + ASSERT(0 != vnet_buffer (b0)->mpls.ttl); + + ttl0 = vnet_buffer(b0)->mpls.ttl - 1; + } + else + { + /* + * not the first label. implying we are recusring down a chain of + * output labels. + * Each layer is considered a new LSP - hence the TTL is reset. 
+ */ + ttl0 = 255; + } + if (PREDICT_TRUE(vnet_buffer(b1)->mpls.first)) + { + ASSERT(1 != vnet_buffer (b1)->mpls.ttl); + ttl1 = vnet_buffer(b1)->mpls.ttl - 1; + } + else + { + ttl1 = 255; + } + } + vnet_buffer(b0)->mpls.first = 0; + vnet_buffer(b1)->mpls.first = 0; + + /* Paint the MPLS header */ + vlib_buffer_advance(b0, -(mld0->mld_n_hdr_bytes)); + vlib_buffer_advance(b1, -(mld1->mld_n_hdr_bytes)); + + hdr0 = vlib_buffer_get_current(b0); + hdr1 = vlib_buffer_get_current(b1); + + clib_memcpy(hdr0, mld0->mld_hdr, mld0->mld_n_hdr_bytes); + clib_memcpy(hdr1, mld1->mld_hdr, mld1->mld_n_hdr_bytes); + + /* fixup the TTL for the inner most label */ + hdr0 = hdr0 + (mld0->mld_n_labels - 1); + hdr1 = hdr1 + (mld1->mld_n_labels - 1); + ((char*)hdr0)[3] = ttl0; + ((char*)hdr1)[3] = ttl1; + + next0 = mld0->mld_dpo.dpoi_next_node; + next1 = mld1->mld_dpo.dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mld0->mld_dpo.dpoi_index; + vnet_buffer(b1)->ip.adj_index[VLIB_TX] = mld1->mld_dpo.dpoi_index; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + mpls_label_imposition_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->hdr = *hdr0; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + mpls_label_imposition_trace_t *tr = + vlib_add_trace (vm, node, b1, sizeof (*tr)); + tr->hdr = *hdr1; + } + + vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, + n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + mpls_unicast_header_t *hdr0; + mpls_label_dpo_t *mld0; + vlib_buffer_t * b0; + u32 bi0, mldi0; + u32 next0; + u8 ttl; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* dst lookup was done by ip4 lookup */ + mldi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + mld0 = mpls_label_dpo_get(mldi0); + + if (payload_is_ip4) + { + /* + * decrement the TTL on ingress to the LSP + */ + ip4_header_t * ip0 = vlib_buffer_get_current(b0); + u32 checksum0; + + checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100); + checksum0 += checksum0 >= 0xffff; + + ip0->checksum = checksum0; + ip0->ttl -= 1; + ttl = ip0->ttl; + } + else if (payload_is_ip6) + { + /* + * decrement the TTL on ingress to the LSP + */ + ip6_header_t * ip0 = vlib_buffer_get_current(b0); + + ip0->hop_limit -= 1; + ttl = ip0->hop_limit; + } + else + { + /* + * else, the packet to be encapped is an MPLS packet + */ + if (vnet_buffer(b0)->mpls.first) + { + /* + * The first label to be imposed on the packet. this is a label swap. + * in which case we stashed the TTL and EXP bits in the + * packet in the lookup node + */ + ASSERT(0 != vnet_buffer (b0)->mpls.ttl); + + ttl = vnet_buffer(b0)->mpls.ttl - 1; + } + else + { + /* + * not the first label. implying we are recusring down a chain of + * output labels. + * Each layer is considered a new LSP - hence the TTL is reset. 
+ */ + ttl = 255; + } + } + vnet_buffer(b0)->mpls.first = 0; + + /* Paint the MPLS header */ + vlib_buffer_advance(b0, -(mld0->mld_n_hdr_bytes)); + hdr0 = vlib_buffer_get_current(b0); + clib_memcpy(hdr0, mld0->mld_hdr, mld0->mld_n_hdr_bytes); + + /* fixup the TTL for the inner most label */ + hdr0 = hdr0 + (mld0->mld_n_labels - 1); + ((char*)hdr0)[3] = ttl; + + next0 = mld0->mld_dpo.dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mld0->mld_dpo.dpoi_index; + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + mpls_label_imposition_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->hdr = *hdr0; + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static u8 * +format_mpls_label_imposition_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + mpls_label_imposition_trace_t * t; + mpls_unicast_header_t hdr; + uword indent; + + t = va_arg (*args, mpls_label_imposition_trace_t *); + indent = format_get_indent (s); + hdr.label_exp_s_ttl = clib_net_to_host_u32(t->hdr.label_exp_s_ttl); + + s = format (s, "%Umpls-header:%U", + format_white_space, indent, + format_mpls_header, hdr); + return (s); +} + +static uword +mpls_label_imposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_imposition_inline(vm, node, frame, 0, 0)); +} + +VLIB_REGISTER_NODE (mpls_label_imposition_node) = { + .function = mpls_label_imposition, + .name = "mpls-label-imposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_imposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (mpls_label_imposition_node, + mpls_label_imposition) + +static uword +ip4_mpls_label_imposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_imposition_inline(vm, node, frame, 1, 0)); +} + +VLIB_REGISTER_NODE (ip4_mpls_label_imposition_node) = { + .function = ip4_mpls_label_imposition, + .name = "ip4-mpls-label-imposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_imposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (ip4_mpls_label_imposition_node, + ip4_mpls_label_imposition) + +static uword +ip6_mpls_label_imposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_imposition_inline(vm, node, frame, 0, 1)); +} + +VLIB_REGISTER_NODE (ip6_mpls_label_imposition_node) = { + .function = ip6_mpls_label_imposition, + .name = "ip6-mpls-label-imposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_imposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (ip6_mpls_label_imposition_node, + ip6_mpls_label_imposition) + +static void +mpls_label_dpo_mem_show (void) +{ + fib_show_memory_usage("MPLS label", + pool_elts(mpls_label_dpo_pool), + pool_len(mpls_label_dpo_pool), + sizeof(mpls_label_dpo_t)); +} + +const static dpo_vft_t mld_vft = { + .dv_lock = mpls_label_dpo_lock, + .dv_unlock = mpls_label_dpo_unlock, + .dv_format = format_mpls_label_dpo, + .dv_mem_show = mpls_label_dpo_mem_show, +}; + +const static char* const mpls_label_imp_ip4_nodes[] = +{ + 
"ip4-mpls-label-imposition", + NULL, +}; +const static char* const mpls_label_imp_ip6_nodes[] = +{ + "ip6-mpls-label-imposition", + NULL, +}; +const static char* const mpls_label_imp_mpls_nodes[] = +{ + "mpls-label-imposition", + NULL, +}; +const static char* const * const mpls_label_imp_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = mpls_label_imp_ip4_nodes, + [DPO_PROTO_IP6] = mpls_label_imp_ip6_nodes, + [DPO_PROTO_MPLS] = mpls_label_imp_mpls_nodes, +}; + + +void +mpls_label_dpo_module_init (void) +{ + dpo_register(DPO_MPLS_LABEL, &mld_vft, mpls_label_imp_nodes); +} diff --git a/src/vnet/dpo/mpls_label_dpo.h b/src/vnet/dpo/mpls_label_dpo.h new file mode 100644 index 00000000000..89bcb093b04 --- /dev/null +++ b/src/vnet/dpo/mpls_label_dpo.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MPLS_LABEL_DPO_H__ +#define __MPLS_LABEL_DPO_H__ + +#include <vnet/vnet.h> +#include <vnet/mpls/packet.h> +#include <vnet/dpo/dpo.h> + +/** + * A representation of an MPLS label for imposition in the data-path + */ +typedef struct mpls_label_dpo_t +{ + /** + * The MPLS label header to impose. Outer most label first. + */ + mpls_unicast_header_t mld_hdr[8]; + + /** + * Next DPO in the graph + */ + dpo_id_t mld_dpo; + + /** + * The protocol of the payload/packets that are being encapped + */ + dpo_proto_t mld_payload_proto; + + /** + * Size of the label stack + */ + u16 mld_n_labels; + + /** + * Cached amount of header bytes to paint + */ + u16 mld_n_hdr_bytes; + + /** + * Number of locks/users of the label + */ + u16 mld_locks; +} mpls_label_dpo_t; + +/** + * @brief Assert that the MPLS label object is less than a cache line in size. + * Should this get any bigger then we will need to reconsider how many labels + * can be pushed in one object. + */ +_Static_assert((sizeof(mpls_label_dpo_t) <= CLIB_CACHE_LINE_BYTES), + "MPLS label DPO is larger than one cache line."); + +/** + * @brief Create an MPLS label object + * + * @param label_stack The stack if labels to impose, outer most label first + * @param eos The inner most label's EOS bit + * @param ttl The inner most label's TTL bit + * @param exp The inner most label's EXP bit + * @param payload_proto The ptocool of the payload packets that will + * be imposed with this label header. 
+ * @param dpo The parent of the created MPLS label object
+ */
+extern index_t mpls_label_dpo_create(mpls_label_t *label_stack,
+                                     mpls_eos_bit_t eos,
+                                     u8 ttl,
+                                     u8 exp,
+                                     dpo_proto_t payload_proto,
+                                     const dpo_id_t *dpo);
+
+extern u8* format_mpls_label_dpo(u8 *s, va_list *args);
+
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern mpls_label_dpo_t *mpls_label_dpo_pool;
+
+static inline mpls_label_dpo_t *
+mpls_label_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(mpls_label_dpo_pool, index));
+}
+
+extern void mpls_label_dpo_module_init(void);
+
+#endif
diff --git a/src/vnet/dpo/punt_dpo.c b/src/vnet/dpo/punt_dpo.c
new file mode 100644
index 00000000000..d1661dcc8e0
--- /dev/null
+++ b/src/vnet/dpo/punt_dpo.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing punting the packet
+ */
+
+#include <vnet/dpo/dpo.h>
+
+static dpo_id_t punt_dpos[DPO_PROTO_NUM];
+
+const dpo_id_t *
+punt_dpo_get (dpo_proto_t proto)
+{
+    dpo_set(&punt_dpos[proto], DPO_PUNT, proto, 1);
+
+    return (&punt_dpos[proto]);
+}
+
+int
+dpo_is_punt (const dpo_id_t *dpo)
+{
+    return (dpo->dpoi_type == DPO_PUNT);
+}
+
+static void
+punt_dpo_lock (dpo_id_t *dpo)
+{
+    /*
+     * Maintaining a lock count on the punt DPO is more trouble than it's
+     * worth. There always needs to be one around, so there is no point in
+     * managing its lifetime.
+     */
+}
+static void
+punt_dpo_unlock (dpo_id_t *dpo)
+{
+}
+
+static u8*
+format_punt_dpo (u8 *s, va_list *ap)
+{
+    CLIB_UNUSED(index_t index) = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+
+    return (format(s, "dpo-punt"));
+}
+
+const static dpo_vft_t punt_vft = {
+    .dv_lock = punt_dpo_lock,
+    .dv_unlock = punt_dpo_unlock,
+    .dv_format = format_punt_dpo,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a punt
+ *        object.
+ *
+ * This means that these graph nodes are ones for which a punt is the
+ * parent object in the DPO-graph.
+ */
+const static char* const punt_ip4_nodes[] =
+{
+    "ip4-punt",
+    NULL,
+};
+const static char* const punt_ip6_nodes[] =
+{
+    "ip6-punt",
+    NULL,
+};
+const static char* const punt_mpls_nodes[] =
+{
+    "mpls-punt",
+    NULL,
+};
+const static char* const * const punt_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = punt_ip4_nodes,
+    [DPO_PROTO_IP6]  = punt_ip6_nodes,
+    [DPO_PROTO_MPLS] = punt_mpls_nodes,
+};
+
+void
+punt_dpo_module_init (void)
+{
+    dpo_register(DPO_PUNT, &punt_vft, punt_nodes);
+}
diff --git a/src/vnet/dpo/punt_dpo.h b/src/vnet/dpo/punt_dpo.h
new file mode 100644
index 00000000000..370547c1596
--- /dev/null
+++ b/src/vnet/dpo/punt_dpo.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief A DPO to punt packets to the Control-plane
+ */
+
+#ifndef __PUNT_DPO_H__
+#define __PUNT_DPO_H__
+
+#include <vnet/dpo/dpo.h>
+
+extern int dpo_is_punt(const dpo_id_t *dpo);
+
+extern const dpo_id_t *punt_dpo_get(dpo_proto_t proto);
+
+extern void punt_dpo_module_init(void);
+
+#endif
diff --git a/src/vnet/dpo/receive_dpo.c b/src/vnet/dpo/receive_dpo.c
new file mode 100644
index 00000000000..2b2571c6c83
--- /dev/null
+++ b/src/vnet/dpo/receive_dpo.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing receiving the packet, i.e. it's for us
+ */
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/receive_dpo.h>
+
+/**
+ * @brief pool of all receive DPOs
+ */
+receive_dpo_t *receive_dpo_pool;
+
+static receive_dpo_t *
+receive_dpo_alloc (void)
+{
+    receive_dpo_t *rd;
+
+    pool_get_aligned(receive_dpo_pool, rd, CLIB_CACHE_LINE_BYTES);
+    memset(rd, 0, sizeof(*rd));
+
+    return (rd);
+}
+
+static receive_dpo_t *
+receive_dpo_get_from_dpo (const dpo_id_t *dpo)
+{
+    ASSERT(DPO_RECEIVE == dpo->dpoi_type);
+
+    return (receive_dpo_get(dpo->dpoi_index));
+}
+
+
+/*
+ * receive_dpo_add_or_lock
+ *
+ * The next_hop address here is used for source address selection in the DP.
+ * The local adj is added to an interface's receive prefix; the next-hop
+ * passed here is the local prefix on the same interface.
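+ *
+ * A minimal usage sketch (illustrative only; the sw_if_index and local
+ * address below are assumptions, not values taken from this patch):
+ *
+ *    dpo_id_t dpo = DPO_INVALID;
+ *
+ *    receive_dpo_add_or_lock(DPO_PROTO_IP4, sw_if_index, &local_addr, &dpo);
+ *    // use 'dpo' as the forwarding result of a local/receive FIB entry
+ *    dpo_reset(&dpo);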
+ */
+void
+receive_dpo_add_or_lock (dpo_proto_t proto,
+                         u32 sw_if_index,
+                         const ip46_address_t *nh_addr,
+                         dpo_id_t *dpo)
+{
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_alloc();
+
+    rd->rd_sw_if_index = sw_if_index;
+    if (NULL != nh_addr)
+    {
+        rd->rd_addr = *nh_addr;
+    }
+
+    dpo_set(dpo, DPO_RECEIVE, proto, (rd - receive_dpo_pool));
+}
+
+static void
+receive_dpo_lock (dpo_id_t *dpo)
+{
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_get_from_dpo(dpo);
+    rd->rd_locks++;
+}
+
+static void
+receive_dpo_unlock (dpo_id_t *dpo)
+{
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_get_from_dpo(dpo);
+    rd->rd_locks--;
+
+    if (0 == rd->rd_locks)
+    {
+        pool_put(receive_dpo_pool, rd);
+    }
+}
+
+static u8*
+format_receive_dpo (u8 *s, va_list *ap)
+{
+    CLIB_UNUSED(index_t index) = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+    vnet_main_t * vnm = vnet_get_main();
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_get(index);
+
+    if (~0 != rd->rd_sw_if_index)
+    {
+        return (format(s, "dpo-receive: %U on %U",
+                       format_ip46_address, &rd->rd_addr, IP46_TYPE_ANY,
+                       format_vnet_sw_interface_name, vnm,
+                       vnet_get_sw_interface(vnm, rd->rd_sw_if_index)));
+    }
+    else
+    {
+        return (format(s, "dpo-receive"));
+    }
+}
+
+static void
+receive_dpo_mem_show (void)
+{
+    fib_show_memory_usage("Receive",
+                          pool_elts(receive_dpo_pool),
+                          pool_len(receive_dpo_pool),
+                          sizeof(receive_dpo_t));
+}
+
+const static dpo_vft_t receive_vft = {
+    .dv_lock = receive_dpo_lock,
+    .dv_unlock = receive_dpo_unlock,
+    .dv_format = format_receive_dpo,
+    .dv_mem_show = receive_dpo_mem_show,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a receive
+ *        object.
+ *
+ * This means that these graph nodes are ones for which a receive is the
+ * parent object in the DPO-graph.
+ */
+const static char* const receive_ip4_nodes[] =
+{
+    "ip4-local",
+    NULL,
+};
+const static char* const receive_ip6_nodes[] =
+{
+    "ip6-local",
+    NULL,
+};
+
+const static char* const * const receive_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = receive_ip4_nodes,
+    [DPO_PROTO_IP6]  = receive_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+void
+receive_dpo_module_init (void)
+{
+    dpo_register(DPO_RECEIVE, &receive_vft, receive_nodes);
+}
diff --git a/src/vnet/dpo/receive_dpo.h b/src/vnet/dpo/receive_dpo.h
new file mode 100644
index 00000000000..2420fd7843c
--- /dev/null
+++ b/src/vnet/dpo/receive_dpo.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing receiving the packet, i.e. it's for us
+ */
+
+#ifndef __RECEIVE_DPO_H__
+#define __RECEIVE_DPO_H__
+
+#include <vnet/dpo/dpo.h>
+#include <vnet/ip/ip6.h>
+
+typedef struct receive_dpo_t_
+{
+    /**
+     * The software interface index on which traffic is received
+     */
+    u32 rd_sw_if_index;
+
+    /**
+     * The address on the receive interface. Packets are destined to this address
+     */
+    ip46_address_t rd_addr;
+
+    /**
+     * Number of locks.
+     */
+    u16 rd_locks;
+} receive_dpo_t;
+
+extern void receive_dpo_add_or_lock (dpo_proto_t proto,
+                                     u32 sw_if_index,
+                                     const ip46_address_t *nh_addr,
+                                     dpo_id_t *dpo);
+
+extern void receive_dpo_module_init(void);
+
+/**
+ * @brief pool of all receive DPOs
+ * (encapsulation violation for fast data-path access)
+ */
+extern receive_dpo_t *receive_dpo_pool;
+
+static inline receive_dpo_t *
+receive_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(receive_dpo_pool, index));
+}
+
+#endif
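Editor's note: the following sketch is not part of the patch above. It ties together the lookup and MPLS-label DPOs by showing how a caller might build a label-imposition DPO over an existing parent (for example one obtained from lookup_dpo_add_or_lock_w_table_id()). The label values and TTL are assumptions chosen for illustration; only APIs declared in the headers above and the clib vec macros are used.

    /*
     * Illustrative sketch (not from this patch): create a label-imposition
     * DPO that pushes a two-label stack onto IPv4 payloads and then forwards
     * via the caller-supplied parent DPO.
     */
    static index_t
    example_push_two_labels (const dpo_id_t *parent)
    {
        mpls_label_t *labels = NULL;
        index_t mldi;

        /* outer-most label first, matching the mld_hdr[] ordering */
        vec_add1(labels, 100);
        vec_add1(labels, 200);

        /* eos/ttl/exp apply to the inner-most label only */
        mldi = mpls_label_dpo_create(labels, MPLS_EOS, 64 /* ttl */, 0 /* exp */,
                                     DPO_PROTO_IP4, parent);

        /* the new object copied the labels and holds its own lock on the parent */
        vec_free(labels);

        return (mldi);
    }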