Diffstat (limited to 'src/vnet/dpo')
28 files changed, 8579 insertions, 0 deletions
diff --git a/src/vnet/dpo/classify_dpo.c b/src/vnet/dpo/classify_dpo.c new file mode 100644 index 00000000..9e7886c9 --- /dev/null +++ b/src/vnet/dpo/classify_dpo.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/ip/ip.h> +#include <vnet/dpo/classify_dpo.h> +#include <vnet/mpls/mpls.h> + +/* + * pool of all MPLS Label DPOs + */ +classify_dpo_t *classify_dpo_pool; + +static classify_dpo_t * +classify_dpo_alloc (void) +{ + classify_dpo_t *cd; + + pool_get_aligned(classify_dpo_pool, cd, CLIB_CACHE_LINE_BYTES); + memset(cd, 0, sizeof(*cd)); + + return (cd); +} + +static index_t +classify_dpo_get_index (classify_dpo_t *cd) +{ + return (cd - classify_dpo_pool); +} + +index_t +classify_dpo_create (dpo_proto_t proto, + u32 classify_table_index) +{ + classify_dpo_t *cd; + + cd = classify_dpo_alloc(); + cd->cd_proto = proto; + cd->cd_table_index = classify_table_index; + + return (classify_dpo_get_index(cd)); +} + +u8* +format_classify_dpo (u8 *s, va_list *args) +{ + index_t index = va_arg (*args, index_t); + CLIB_UNUSED(u32 indent) = va_arg (*args, u32); + classify_dpo_t *cd; + + cd = classify_dpo_get(index); + + return (format(s, "%U-classify:[%d]:table:%d", + format_dpo_proto, cd->cd_proto, + index, cd->cd_table_index)); +} + +static void +classify_dpo_lock (dpo_id_t *dpo) +{ + classify_dpo_t *cd; + + cd = classify_dpo_get(dpo->dpoi_index); + + cd->cd_locks++; +} + +static void +classify_dpo_unlock (dpo_id_t *dpo) +{ + classify_dpo_t *cd; + + cd = classify_dpo_get(dpo->dpoi_index); + + cd->cd_locks--; + + if (0 == cd->cd_locks) + { + pool_put(classify_dpo_pool, cd); + } +} + +static void +classify_dpo_mem_show (void) +{ + fib_show_memory_usage("Classify", + pool_elts(classify_dpo_pool), + pool_len(classify_dpo_pool), + sizeof(classify_dpo_t)); +} + +const static dpo_vft_t cd_vft = { + .dv_lock = classify_dpo_lock, + .dv_unlock = classify_dpo_unlock, + .dv_format = format_classify_dpo, + .dv_mem_show = classify_dpo_mem_show, +}; + +const static char* const classify_ip4_nodes[] = +{ + "ip4-classify", + NULL, +}; +const static char* const classify_ip6_nodes[] = +{ + "ip6-classify", + NULL, +}; +const static char* const * const classify_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = classify_ip4_nodes, + [DPO_PROTO_IP6] = classify_ip6_nodes, + [DPO_PROTO_MPLS] = NULL, +}; + +void +classify_dpo_module_init (void) +{ + dpo_register(DPO_CLASSIFY, &cd_vft, classify_nodes); +} diff --git a/src/vnet/dpo/classify_dpo.h b/src/vnet/dpo/classify_dpo.h new file mode 100644 index 00000000..48f4b2bf --- /dev/null +++ b/src/vnet/dpo/classify_dpo.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
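/*
 * A minimal usage sketch of the classify DPO API added above (not part
 * of the patch itself). 'my_table_index' is a placeholder for a table
 * created via the vnet classifier; the functions are those from
 * classify_dpo.c/.h as introduced in this diff.
 */
static void
classify_dpo_usage_sketch (u32 my_table_index, dpo_id_t *dpo)
{
    index_t cdi;

    /* allocate a classify DPO instance bound to the table */
    cdi = classify_dpo_create (DPO_PROTO_IP4, my_table_index);

    /* bind it into a dpo_id_t; dpo_set() takes the reference lock */
    dpo_set (dpo, DPO_CLASSIFY, DPO_PROTO_IP4, cdi);
}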
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CLASSIFY_DPO_H__ +#define __CLASSIFY_DPO_H__ + +#include <vnet/vnet.h> +#include <vnet/mpls/packet.h> +#include <vnet/dpo/dpo.h> + +/** + * A representation of an MPLS label for imposition in the data-path + */ +typedef struct classify_dpo_t +{ + dpo_proto_t cd_proto; + + u32 cd_table_index; + + /** + * Number of locks/users of the label + */ + u16 cd_locks; +} classify_dpo_t; + +extern index_t classify_dpo_create(dpo_proto_t proto, + u32 classify_table_index); + +extern u8* format_classify_dpo(u8 *s, va_list *args); + +/* + * Encapsulation violation for fast data-path access + */ +extern classify_dpo_t *classify_dpo_pool; + +static inline classify_dpo_t * +classify_dpo_get (index_t index) +{ + return (pool_elt_at_index(classify_dpo_pool, index)); +} + +extern void classify_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/dpo.c b/src/vnet/dpo/dpo.c new file mode 100644 index 00000000..bd18b66b --- /dev/null +++ b/src/vnet/dpo/dpo.c @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * A Data-Path Object is an object that represents actions that are + * applied to packets are they are switched through VPP. + * + * The DPO is a base class that is specialised by other objects to provide + * concreate actions + * + * The VLIB graph nodes are graph of types, the DPO graph is a graph of instances. + */ + +#include <vnet/dpo/dpo.h> +#include <vnet/ip/lookup.h> +#include <vnet/ip/format.h> +#include <vnet/adj/adj.h> + +#include <vnet/dpo/load_balance.h> +#include <vnet/dpo/mpls_label_dpo.h> +#include <vnet/dpo/lookup_dpo.h> +#include <vnet/dpo/drop_dpo.h> +#include <vnet/dpo/receive_dpo.h> +#include <vnet/dpo/punt_dpo.h> +#include <vnet/dpo/classify_dpo.h> +#include <vnet/dpo/ip_null_dpo.h> +#include <vnet/dpo/replicate_dpo.h> +#include <vnet/dpo/interface_rx_dpo.h> +#include <vnet/dpo/interface_tx_dpo.h> +#include <vnet/dpo/mpls_disposition.h> + +/** + * Array of char* names for the DPO types and protos + */ +static const char* dpo_type_names[] = DPO_TYPES; +static const char* dpo_proto_names[] = DPO_PROTOS; + +/** + * @brief Vector of virtual function tables for the DPO types + * + * This is a vector so we can dynamically register new DPO types in plugins. + */ +static dpo_vft_t *dpo_vfts; + +/** + * @brief vector of graph node names associated with each DPO type and protocol. + * + * dpo_nodes[child_type][child_proto][node_X] = node_name; + * i.e. 
+ * dpo_node[DPO_LOAD_BALANCE][DPO_PROTO_IP4][0] = "ip4-lookup" + * dpo_node[DPO_LOAD_BALANCE][DPO_PROTO_IP4][1] = "ip4-load-balance" + * + * This is a vector so we can dynamically register new DPO types in plugins. + */ +static const char* const * const ** dpo_nodes; + +/** + * @brief Vector of edge indicies from parent DPO nodes to child + * + * dpo_edges[child_type][child_proto][parent_type][parent_proto] = edge_index + * + * This array is derived at init time from the dpo_nodes above. Note that + * the third dimension in dpo_nodes is lost, hence, the edge index from each + * node MUST be the same. + * Including both the child and parent protocol is required to support the + * case where it changes as the grapth is traversed, most notablly when an + * MPLS label is popped. + * + * Note that this array is child type specific, not child instance specific. + */ +static u32 ****dpo_edges; + +/** + * @brief The DPO type value that can be assigend to the next dynamic + * type registration. + */ +static dpo_type_t dpo_dynamic = DPO_LAST; + +dpo_proto_t +vnet_link_to_dpo_proto (vnet_link_t linkt) +{ + switch (linkt) + { + case VNET_LINK_IP6: + return (DPO_PROTO_IP6); + case VNET_LINK_IP4: + return (DPO_PROTO_IP4); + case VNET_LINK_MPLS: + return (DPO_PROTO_MPLS); + case VNET_LINK_ETHERNET: + return (DPO_PROTO_ETHERNET); + case VNET_LINK_NSH: + return (DPO_PROTO_NSH); + case VNET_LINK_ARP: + break; + } + ASSERT(0); + return (0); +} + +vnet_link_t +dpo_proto_to_link (dpo_proto_t dp) +{ + switch (dp) + { + case DPO_PROTO_IP6: + return (VNET_LINK_IP6); + case DPO_PROTO_IP4: + return (VNET_LINK_IP4); + case DPO_PROTO_MPLS: + return (VNET_LINK_MPLS); + case DPO_PROTO_ETHERNET: + return (VNET_LINK_ETHERNET); + case DPO_PROTO_NSH: + return (VNET_LINK_NSH); + } + return (~0); +} + +u8 * +format_dpo_type (u8 * s, va_list * args) +{ + dpo_type_t type = va_arg (*args, int); + + s = format(s, "%s", dpo_type_names[type]); + + return (s); +} + +u8 * +format_dpo_id (u8 * s, va_list * args) +{ + dpo_id_t *dpo = va_arg (*args, dpo_id_t*); + u32 indent = va_arg (*args, u32); + + s = format(s, "[@%d]: ", dpo->dpoi_next_node); + + if (NULL != dpo_vfts[dpo->dpoi_type].dv_format) + { + return (format(s, "%U", + dpo_vfts[dpo->dpoi_type].dv_format, + dpo->dpoi_index, + indent)); + } + + switch (dpo->dpoi_type) + { + case DPO_FIRST: + s = format(s, "unset"); + break; + default: + s = format(s, "unknown"); + break; + } + return (s); +} + +u8 * +format_dpo_proto (u8 * s, va_list * args) +{ + dpo_proto_t proto = va_arg (*args, int); + + return (format(s, "%s", dpo_proto_names[proto])); +} + +void +dpo_set (dpo_id_t *dpo, + dpo_type_t type, + dpo_proto_t proto, + index_t index) +{ + dpo_id_t tmp = *dpo; + + dpo->dpoi_type = type; + dpo->dpoi_proto = proto, + dpo->dpoi_index = index; + + if (DPO_ADJACENCY == type) + { + /* + * set the adj subtype + */ + ip_adjacency_t *adj; + + adj = adj_get(index); + + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_ARP: + dpo->dpoi_type = DPO_ADJACENCY_INCOMPLETE; + break; + case IP_LOOKUP_NEXT_MIDCHAIN: + dpo->dpoi_type = DPO_ADJACENCY_MIDCHAIN; + break; + case IP_LOOKUP_NEXT_MCAST_MIDCHAIN: + dpo->dpoi_type = DPO_ADJACENCY_MCAST_MIDCHAIN; + break; + case IP_LOOKUP_NEXT_MCAST: + dpo->dpoi_type = DPO_ADJACENCY_MCAST; + break; + case IP_LOOKUP_NEXT_GLEAN: + dpo->dpoi_type = DPO_ADJACENCY_GLEAN; + break; + default: + break; + } + } + dpo_lock(dpo); + dpo_unlock(&tmp); +} + +void +dpo_reset (dpo_id_t *dpo) +{ + dpo_id_t tmp = DPO_INVALID; + + /* + * use the atomic copy operation. 
+ */ + dpo_copy(dpo, &tmp); +} + +/** + * \brief + * Compare two Data-path objects + * + * like memcmp, return 0 is matching, !0 otherwise. + */ +int +dpo_cmp (const dpo_id_t *dpo1, + const dpo_id_t *dpo2) +{ + int res; + + res = dpo1->dpoi_type - dpo2->dpoi_type; + + if (0 != res) return (res); + + return (dpo1->dpoi_index - dpo2->dpoi_index); +} + +void +dpo_copy (dpo_id_t *dst, + const dpo_id_t *src) +{ + dpo_id_t tmp = *dst; + + /* + * the destination is written in a single u64 write - hence atomically w.r.t + * any packets inflight. + */ + *((u64*)dst) = *(u64*)src; + + dpo_lock(dst); + dpo_unlock(&tmp); +} + +int +dpo_is_adj (const dpo_id_t *dpo) +{ + return ((dpo->dpoi_type == DPO_ADJACENCY) || + (dpo->dpoi_type == DPO_ADJACENCY_INCOMPLETE) || + (dpo->dpoi_type == DPO_ADJACENCY_MIDCHAIN) || + (dpo->dpoi_type == DPO_ADJACENCY_GLEAN)); +} + +static u32 * +dpo_default_get_next_node (const dpo_id_t *dpo) +{ + u32 *node_indices = NULL; + const char *node_name; + u32 ii = 0; + + node_name = dpo_nodes[dpo->dpoi_type][dpo->dpoi_proto][ii]; + while (NULL != node_name) + { + vlib_node_t *node; + + node = vlib_get_node_by_name(vlib_get_main(), (u8*) node_name); + ASSERT(NULL != node); + vec_add1(node_indices, node->index); + + ++ii; + node_name = dpo_nodes[dpo->dpoi_type][dpo->dpoi_proto][ii]; + } + + return (node_indices); +} + +void +dpo_register (dpo_type_t type, + const dpo_vft_t *vft, + const char * const * const * nodes) +{ + vec_validate(dpo_vfts, type); + dpo_vfts[type] = *vft; + if (NULL == dpo_vfts[type].dv_get_next_node) + { + dpo_vfts[type].dv_get_next_node = dpo_default_get_next_node; + } + + vec_validate(dpo_nodes, type); + dpo_nodes[type] = nodes; +} + +dpo_type_t +dpo_register_new_type (const dpo_vft_t *vft, + const char * const * const * nodes) +{ + dpo_type_t type = dpo_dynamic++; + + dpo_register(type, vft, nodes); + + return (type); +} + +void +dpo_lock (dpo_id_t *dpo) +{ + if (!dpo_id_is_valid(dpo)) + return; + + dpo_vfts[dpo->dpoi_type].dv_lock(dpo); +} + +void +dpo_unlock (dpo_id_t *dpo) +{ + if (!dpo_id_is_valid(dpo)) + return; + + dpo_vfts[dpo->dpoi_type].dv_unlock(dpo); +} + + +static u32 +dpo_get_next_node (dpo_type_t child_type, + dpo_proto_t child_proto, + const dpo_id_t *parent_dpo) +{ + dpo_proto_t parent_proto; + dpo_type_t parent_type; + + parent_type = parent_dpo->dpoi_type; + parent_proto = parent_dpo->dpoi_proto; + + vec_validate(dpo_edges, child_type); + vec_validate(dpo_edges[child_type], child_proto); + vec_validate(dpo_edges[child_type][child_proto], parent_type); + vec_validate_init_empty( + dpo_edges[child_type][child_proto][parent_type], + parent_proto, ~0); + + /* + * if the edge index has not yet been created for this node to node transistion + */ + if (~0 == dpo_edges[child_type][child_proto][parent_type][parent_proto]) + { + vlib_node_t *child_node; + u32 *parent_indices; + vlib_main_t *vm; + u32 edge, *pi, cc; + + vm = vlib_get_main(); + + ASSERT(NULL != dpo_vfts[parent_type].dv_get_next_node); + ASSERT(NULL != dpo_nodes[child_type]); + ASSERT(NULL != dpo_nodes[child_type][child_proto]); + + cc = 0; + parent_indices = dpo_vfts[parent_type].dv_get_next_node(parent_dpo); + + vlib_worker_thread_barrier_sync(vm); + + /* + * create a graph arc from each of the child's registered node types, + * to each of the parent's. 
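/*
 * A condensed restatement (a sketch, not a new API) of the
 * make-before-break pattern that dpo_copy() and dpo_set() rely on:
 * because dpo_id_t fits in a u64, the switch-over is a single aligned
 * store, and the old parent is unlocked only after the new one is
 * locked.
 */
static void
dpo_update_sketch (dpo_id_t *in_use, const dpo_id_t *replacement)
{
    dpo_id_t old = *in_use;

    /* one 64-bit store: forwarding threads observe either the old or
     * the new identity in full, never a torn mix of the two */
    *((u64*)in_use) = *(u64*)replacement;

    dpo_lock (in_use);    /* take a reference on the new parent... */
    dpo_unlock (&old);    /* ...before dropping the old one */
}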
+ */ + while (NULL != dpo_nodes[child_type][child_proto][cc]) + { + child_node = + vlib_get_node_by_name(vm, + (u8*) dpo_nodes[child_type][child_proto][cc]); + + vec_foreach(pi, parent_indices) + { + edge = vlib_node_add_next(vm, child_node->index, *pi); + + if (~0 == dpo_edges[child_type][child_proto][parent_type][parent_proto]) + { + dpo_edges[child_type][child_proto][parent_type][parent_proto] = edge; + } + else + { + ASSERT(dpo_edges[child_type][child_proto][parent_type][parent_proto] == edge); + } + } + cc++; + } + + vlib_worker_thread_barrier_release(vm); + vec_free(parent_indices); + } + + return (dpo_edges[child_type][child_proto][parent_type][parent_proto]); +} + +/** + * @brief Stack one DPO object on another, and thus establish a child parent + * relationship. The VLIB graph arc used is taken from the parent and child types + * passed. + */ +static void +dpo_stack_i (u32 edge, + dpo_id_t *dpo, + const dpo_id_t *parent) +{ + /* + * in order to get an atomic update of the parent we create a temporary, + * from a copy of the child, and add the next_node. then we copy to the parent + */ + dpo_id_t tmp = DPO_INVALID; + dpo_copy(&tmp, parent); + + /* + * get the edge index for the parent to child VLIB graph transisition + */ + tmp.dpoi_next_node = edge; + + /* + * this update is atomic. + */ + dpo_copy(dpo, &tmp); + + dpo_reset(&tmp); +} + +/** + * @brief Stack one DPO object on another, and thus establish a child-parent + * relationship. The VLIB graph arc used is taken from the parent and child types + * passed. + */ +void +dpo_stack (dpo_type_t child_type, + dpo_proto_t child_proto, + dpo_id_t *dpo, + const dpo_id_t *parent) +{ + dpo_stack_i(dpo_get_next_node(child_type, child_proto, parent), dpo, parent); +} + +/** + * @brief Stack one DPO object on another, and thus establish a child parent + * relationship. A new VLIB graph arc is created from the child node passed + * to the nodes registered by the parent. The VLIB infra will ensure this arc + * is added only once. 
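/*
 * Sketch of how a plugin would use dpo_register_new_type() to add its
 * own DPO type. The node name "my-dpo-ip4" and the stub lock/unlock
 * bodies are hypothetical; the registration calls and table shapes are
 * those defined above.
 */
static void my_dpo_lock (dpo_id_t *dpo)   { /* ++ per-instance ref count */ }
static void my_dpo_unlock (dpo_id_t *dpo) { /* --, free on zero */ }

const static dpo_vft_t my_dpo_vft = {
    .dv_lock = my_dpo_lock,
    .dv_unlock = my_dpo_unlock,
};

/* per-protocol, NULL-terminated lists of originating graph nodes */
const static char* const my_dpo_ip4_nodes[] = { "my-dpo-ip4", NULL };
const static char* const * const my_dpo_nodes[DPO_PROTO_NUM] = {
    [DPO_PROTO_IP4] = my_dpo_ip4_nodes,
};

static dpo_type_t my_dpo_type;

static void
my_dpo_plugin_init (void)
{
    /* allocates the next dynamic dpo_type_t beyond DPO_LAST */
    my_dpo_type = dpo_register_new_type (&my_dpo_vft, my_dpo_nodes);
}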
+ */ +void +dpo_stack_from_node (u32 child_node_index, + dpo_id_t *dpo, + const dpo_id_t *parent) +{ + dpo_type_t parent_type; + u32 *parent_indices; + vlib_main_t *vm; + u32 edge, *pi; + + edge = 0; + parent_type = parent->dpoi_type; + vm = vlib_get_main(); + + ASSERT(NULL != dpo_vfts[parent_type].dv_get_next_node); + parent_indices = dpo_vfts[parent_type].dv_get_next_node(parent); + ASSERT(parent_indices); + + /* + * This loop is purposefully written with the worker thread lock in the + * inner loop because; + * 1) the likelihood that the edge does not exist is smaller + * 2) the likelihood there is more than one node is even smaller + * so we are optimising for not need to take the lock + */ + vec_foreach(pi, parent_indices) + { + edge = vlib_node_get_next(vm, child_node_index, *pi); + + if (~0 == edge) + { + vlib_worker_thread_barrier_sync(vm); + + edge = vlib_node_add_next(vm, child_node_index, *pi); + + vlib_worker_thread_barrier_release(vm); + } + } + dpo_stack_i(edge, dpo, parent); +} + +static clib_error_t * +dpo_module_init (vlib_main_t * vm) +{ + drop_dpo_module_init(); + punt_dpo_module_init(); + receive_dpo_module_init(); + load_balance_module_init(); + mpls_label_dpo_module_init(); + classify_dpo_module_init(); + lookup_dpo_module_init(); + ip_null_dpo_module_init(); + replicate_module_init(); + interface_rx_dpo_module_init(); + interface_tx_dpo_module_init(); + mpls_disp_dpo_module_init(); + + return (NULL); +} + +VLIB_INIT_FUNCTION(dpo_module_init); + +static clib_error_t * +dpo_memory_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + dpo_vft_t *vft; + + vlib_cli_output (vm, "DPO memory"); + vlib_cli_output (vm, "%=30s %=5s %=8s/%=9s totals", + "Name","Size", "in-use", "allocated"); + + vec_foreach(vft, dpo_vfts) + { + if (NULL != vft->dv_mem_show) + vft->dv_mem_show(); + } + + return (NULL); +} + +/* *INDENT-OFF* */ +/*? + * The '<em>sh dpo memory </em>' command displays the memory usage for each + * data-plane object type. + * + * @cliexpar + * @cliexstart{show dpo memory} + * DPO memory + * Name Size in-use /allocated totals + * load-balance 64 12 / 12 768/768 + * Adjacency 256 1 / 1 256/256 + * Receive 24 5 / 5 120/120 + * Lookup 12 0 / 0 0/0 + * Classify 12 0 / 0 0/0 + * MPLS label 24 0 / 0 0/0 + * @cliexend +?*/ +VLIB_CLI_COMMAND (show_fib_memory, static) = { + .path = "show dpo memory", + .function = dpo_memory_show, + .short_help = "show dpo memory", +}; +/* *INDENT-ON* */ diff --git a/src/vnet/dpo/dpo.h b/src/vnet/dpo/dpo.h new file mode 100644 index 00000000..33562968 --- /dev/null +++ b/src/vnet/dpo/dpo.h @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * A Data-Path Object is an object that represents actions that are + * applied to packets are they are switched through VPP's data-path. 
+ * + * The DPO can be considered to be like is a base class that is specialised + * by other objects to provide concreate actions + * + * The VLIB graph nodes are graph of DPO types, the DPO graph is a graph of + * instances. + */ + +#ifndef __DPO_H__ +#define __DPO_H__ + +#include <vnet/vnet.h> + +/** + * @brief An index for adjacencies. + * Alas 'C' is not typesafe enough to b0rk when a u32 is used instead of + * an index_t. However, for us humans, we can glean much more intent + * from the declaration + * foo barindex_t t); + * than we can from + * foo bar(u32 t); + */ +typedef u32 index_t; + +/** + * @brief Invalid index - used when no index is known + * blazoned capitals INVALID speak volumes where ~0 does not. + */ +#define INDEX_INVALID ((index_t)(~0)) + +/** + * @brief Data path protocol. + * Actions performed on packets in the data-plane can be described and represented + * by protocol independent objects, i.e. ADJACENCY, but the spceifics actions + * required during ADJACENCY processing can be protocol dependent. For example, + * the adjacency rewrite node performs a ip4 checksum calculation, ip6 and MPLS + * do not, all 3 perform a TTL decrement. The VLIB graph nodes are thus protocol + * dependent, and thus each graph edge/arc is too. + * When programming a DPO's next node arc from child to parent it is thus required + * to know the parent's data-path protocol so the correct arc index can be used. + */ +typedef enum dpo_proto_t_ +{ + DPO_PROTO_IP4 = 0, + DPO_PROTO_IP6, + DPO_PROTO_MPLS, + DPO_PROTO_ETHERNET, + DPO_PROTO_NSH, +} __attribute__((packed)) dpo_proto_t; + +#define DPO_PROTO_NUM ((dpo_proto_t)(DPO_PROTO_NSH+1)) +#define DPO_PROTO_NONE ((dpo_proto_t)(DPO_PROTO_NUM+1)) + +#define DPO_PROTOS { \ + [DPO_PROTO_IP4] = "ip4", \ + [DPO_PROTO_IP6] = "ip6", \ + [DPO_PROTO_ETHERNET] = "ethernet", \ + [DPO_PROTO_MPLS] = "mpls", \ + [DPO_PROTO_NSH] = "nsh", \ +} + +#define FOR_EACH_DPO_PROTO(_proto) \ + for (_proto = DPO_PROTO_IP4; \ + _proto <= DPO_PROTO_NSH; \ + _proto++) + +/** + * @brief Common types of data-path objects + * New types can be dynamically added using dpo_register_new_type() + */ +typedef enum dpo_type_t_ { + /** + * A non-zero value first so we can spot unitialisation errors + */ + DPO_FIRST, + DPO_DROP, + DPO_IP_NULL, + DPO_PUNT, + /** + * @brief load-balancing over a choice of [un]equal cost paths + */ + DPO_LOAD_BALANCE, + DPO_REPLICATE, + DPO_ADJACENCY, + DPO_ADJACENCY_INCOMPLETE, + DPO_ADJACENCY_MIDCHAIN, + DPO_ADJACENCY_GLEAN, + DPO_ADJACENCY_MCAST, + DPO_ADJACENCY_MCAST_MIDCHAIN, + DPO_RECEIVE, + DPO_LOOKUP, + DPO_LISP_CP, + DPO_CLASSIFY, + DPO_MPLS_LABEL, + DPO_MPLS_DISPOSITION, + DPO_MFIB_ENTRY, + DPO_INTERFACE_RX, + DPO_INTERFACE_TX, + DPO_LAST, +} __attribute__((packed)) dpo_type_t; + +#define DPO_TYPE_NUM DPO_LAST + +#define DPO_TYPES { \ + [DPO_FIRST] = "dpo-invalid", \ + [DPO_DROP] = "dpo-drop", \ + [DPO_IP_NULL] = "dpo-ip-null", \ + [DPO_PUNT] = "dpo-punt", \ + [DPO_ADJACENCY] = "dpo-adjacency", \ + [DPO_ADJACENCY_INCOMPLETE] = "dpo-adjacency-incomplete", \ + [DPO_ADJACENCY_MIDCHAIN] = "dpo-adjacency-midcahin", \ + [DPO_ADJACENCY_GLEAN] = "dpo-glean", \ + [DPO_ADJACENCY_MCAST] = "dpo-adj-mcast", \ + [DPO_ADJACENCY_MCAST_MIDCHAIN] = "dpo-adj-mcast-midchain", \ + [DPO_RECEIVE] = "dpo-receive", \ + [DPO_LOOKUP] = "dpo-lookup", \ + [DPO_LOAD_BALANCE] = "dpo-load-balance", \ + [DPO_REPLICATE] = "dpo-replicate", \ + [DPO_LISP_CP] = "dpo-lisp-cp", \ + [DPO_CLASSIFY] = "dpo-classify", \ + [DPO_MPLS_LABEL] = "dpo-mpls-label", \ + [DPO_MPLS_DISPOSITION] = 
"dpo-mpls-diposition", \ + [DPO_MFIB_ENTRY] = "dpo-mfib_entry", \ + [DPO_INTERFACE_RX] = "dpo-interface-rx", \ + [DPO_INTERFACE_TX] = "dpo-interface-tx" \ +} + +/** + * @brief The identity of a DPO is a combination of its type and its + * instance number/index of objects of that type + */ +typedef struct dpo_id_t_ { + /** + * the type + */ + dpo_type_t dpoi_type; + /** + * the data-path protocol of the type. + */ + dpo_proto_t dpoi_proto; + /** + * The next VLIB node to follow. + */ + u16 dpoi_next_node; + /** + * the index of objects of that type + */ + index_t dpoi_index; +} __attribute__ ((aligned(sizeof(u64)))) dpo_id_t; + +STATIC_ASSERT(sizeof(dpo_id_t) <= sizeof(u64), + "DPO ID is greater than sizeof u64 " + "atomic updates need to be revisited"); + +/** + * @brief An initialiser for DPOs declared on the stack. + * Thenext node is set to 0 since VLIB graph nodes should set 0 index to drop. + */ +#define DPO_INVALID \ +{ \ + .dpoi_type = DPO_FIRST, \ + .dpoi_proto = DPO_PROTO_NONE, \ + .dpoi_index = INDEX_INVALID, \ + .dpoi_next_node = 0, \ +} + +/** + * @brief Return true if the DPO object is valid, i.e. has been initialised. + */ +static inline int +dpo_id_is_valid (const dpo_id_t *dpoi) +{ + return (dpoi->dpoi_type != DPO_FIRST && + dpoi->dpoi_index != INDEX_INVALID); +} + +extern dpo_proto_t vnet_link_to_dpo_proto(vnet_link_t linkt); + +/** + * @brief + * Take a reference counting lock on the DPO + */ +extern void dpo_lock(dpo_id_t *dpo); + +/** + * @brief + * Release a reference counting lock on the DPO + */ +extern void dpo_unlock(dpo_id_t *dpo); + +/** + * @brief Set/create a DPO ID + * The DPO will be locked. + * + * @param dpo + * The DPO object to configure + * + * @param type + * The dpo_type_t of the DPO + * + * @param proto + * The dpo_proto_t of the DPO + * + * @param index + * The type specific index of the DPO + */ +extern void dpo_set(dpo_id_t *dpo, + dpo_type_t type, + dpo_proto_t proto, + index_t index); + +/** + * @brief reset a DPO ID + * The DPO will be unlocked. + * + * @param dpo + * The DPO object to reset + */ +extern void dpo_reset(dpo_id_t *dpo); + +/** + * @brief compare two DPOs for equality + */ +extern int dpo_cmp(const dpo_id_t *dpo1, + const dpo_id_t *dpo2); + +/** + * @brief + * atomic copy a data-plane object. + * This is safe to use when the dst DPO is currently switching packets + */ +extern void dpo_copy(dpo_id_t *dst, + const dpo_id_t *src); + +/** + * @brief Return TRUE is the DPO is any type of adjacency + */ +extern int dpo_is_adj(const dpo_id_t *dpo); + +/** + * @biref Format a DPO_id_t oject + */ +extern u8 *format_dpo_id(u8 * s, va_list * args); + +/** + * @biref format a DPO type + */ +extern u8 *format_dpo_type(u8 * s, va_list * args); + +/** + * @brief format a DPO protocol + */ +extern u8 *format_dpo_proto(u8 * s, va_list * args); + +/** + * @brief format a DPO protocol + */ +extern vnet_link_t dpo_proto_to_link(dpo_proto_t dp); + +/** + * @brief + * Set and stack a DPO. + * The DPO passed is set to the parent DPO and the necessary + * VLIB graph arcs are created. The child_type and child_proto + * are used to get the VLID nodes from which the arcs are added. + * + * @param child_type + * Child DPO type. + * + * @param child_proto + * Child DPO proto + * + * @parem dpo + * This is the DPO to stack and set. + * + * @paren parent_dpo + * The parent DPO to stack onto. + */ +extern void dpo_stack(dpo_type_t child_type, + dpo_proto_t child_proto, + dpo_id_t *dpo, + const dpo_id_t *parent_dpo); + +/** + * @brief + * Set and stack a DPO. 
+ * The DPO passed is set to the parent DPO and the necessary + * VLIB graph arcs are created, from the child_node passed. + * + * @param child_node + * The VLIB grpah node index to create an arc from to the parent + * + * @parem dpo + * This is the DPO to stack and set. + * + * @paren parent_dpo + * The parent DPO to stack onto. + */ +extern void dpo_stack_from_node(u32 child_node, + dpo_id_t *dpo, + const dpo_id_t *parent); + +/** + * @brief A lock function registered for a DPO type + */ +typedef void (*dpo_lock_fn_t)(dpo_id_t *dpo); + +/** + * @brief An unlock function registered for a DPO type + */ +typedef void (*dpo_unlock_fn_t)(dpo_id_t *dpo); + +/** + * @brief An memory usage show command + */ +typedef void (*dpo_mem_show_t)(void); + +/** + * @brief Given a DPO instance return a vector of node indices that + * the type/instance will use. + */ +typedef u32* (*dpo_get_next_node_t)(const dpo_id_t *dpo); + +/** + * @brief A virtual function table regisitered for a DPO type + */ +typedef struct dpo_vft_t_ +{ + /** + * A reference counting lock function + */ + dpo_lock_fn_t dv_lock; + /** + * A reference counting unlock function + */ + dpo_lock_fn_t dv_unlock; + /** + * A format function + */ + format_function_t *dv_format; + /** + * A show memory usage function + */ + dpo_mem_show_t dv_mem_show; + /** + * A function to get the next VLIB node given an instance + * of the DPO. If this is null, then the node's name MUST be + * retreiveable from the nodes names array passed in the register + * function + */ + dpo_get_next_node_t dv_get_next_node; +} dpo_vft_t; + + +/** + * @brief For a given DPO type Register: + * - a virtual function table + * - a NULL terminated array of graph nodes from which that object type + * will originate packets, i.e. the nodes in which the object type will be + * the parent DPO in the DP graph. The ndoes are per-data-path protocol + * (see above). + * + * @param type + * The type being registered. + * + * @param vft + * The virtual function table to register for the type. + * + * @param nodes + * The string description of the per-protocol VLIB graph nodes. + */ +extern void dpo_register(dpo_type_t type, + const dpo_vft_t *vft, + const char * const * const * nodes); + +/** + * @brief Create and register a new DPO type. + * + * This can be used by plugins to create new DPO types that are not listed + * in dpo_type_t enum + * + * @param vft + * The virtual function table to register for the type. + * + * @param nodes + * The string description of the per-protocol VLIB graph nodes. + * + * @return The new dpo_type_t + */ +extern dpo_type_t dpo_register_new_type(const dpo_vft_t *vft, + const char * const * const * nodes); + +#endif diff --git a/src/vnet/dpo/drop_dpo.c b/src/vnet/dpo/drop_dpo.c new file mode 100644 index 00000000..a1821ddd --- /dev/null +++ b/src/vnet/dpo/drop_dpo.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * @brief + * The data-path object representing dropping the packet + */ + +#include <vnet/dpo/dpo.h> + +static dpo_id_t drop_dpos[DPO_PROTO_NUM]; + +const dpo_id_t * +drop_dpo_get (dpo_proto_t proto) +{ + dpo_set(&drop_dpos[proto], DPO_DROP, proto, proto); + + return (&drop_dpos[proto]); +} + +int +dpo_is_drop (const dpo_id_t *dpo) +{ + return (dpo->dpoi_type == DPO_DROP); +} + +static void +drop_dpo_lock (dpo_id_t *dpo) +{ + /* + * not maintaining a lock count on the drop + * more trouble than it's worth. + * There always needs to be one around. no point it managaing its lifetime + */ +} +static void +drop_dpo_unlock (dpo_id_t *dpo) +{ +} + +static u8* +format_drop_dpo (u8 *s, va_list *ap) +{ + CLIB_UNUSED(index_t index) = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + + return (format(s, "dpo-drop %U", format_dpo_proto, index)); +} + +const static dpo_vft_t drop_vft = { + .dv_lock = drop_dpo_lock, + .dv_unlock = drop_dpo_unlock, + .dv_format = format_drop_dpo, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a drop + * object. + * + * this means that these graph nodes are ones from which a drop is the + * parent object in the DPO-graph. + */ +const static char* const drop_ip4_nodes[] = +{ + "ip4-drop", + NULL, +}; +const static char* const drop_ip6_nodes[] = +{ + "ip6-drop", + NULL, +}; +const static char* const drop_mpls_nodes[] = +{ + "mpls-drop", + NULL, +}; +const static char* const drop_ethernet_nodes[] = +{ + "error-drop", + NULL, +}; +const static char* const drop_nsh_nodes[] = +{ + "error-drop", + NULL, +}; +const static char* const * const drop_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = drop_ip4_nodes, + [DPO_PROTO_IP6] = drop_ip6_nodes, + [DPO_PROTO_MPLS] = drop_mpls_nodes, + [DPO_PROTO_ETHERNET] = drop_ethernet_nodes, + [DPO_PROTO_NSH] = drop_nsh_nodes, +}; + +void +drop_dpo_module_init (void) +{ + dpo_register(DPO_DROP, &drop_vft, drop_nodes); +} diff --git a/src/vnet/dpo/drop_dpo.h b/src/vnet/dpo/drop_dpo.h new file mode 100644 index 00000000..436df36c --- /dev/null +++ b/src/vnet/dpo/drop_dpo.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief The Drop DPO will drop all packets, no questions asked. It is valid + * for any packet protocol. + */ + +#ifndef __DROP_DPO_H__ +#define __DROP_DPO_H__ + +#include <vnet/dpo/dpo.h> + +extern int dpo_is_drop(const dpo_id_t *dpo); + +extern const dpo_id_t *drop_dpo_get(dpo_proto_t proto); + +extern void drop_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/interface_rx_dpo.c b/src/vnet/dpo/interface_rx_dpo.c new file mode 100644 index 00000000..a624f514 --- /dev/null +++ b/src/vnet/dpo/interface_rx_dpo.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
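/*
 * Sketch: a child DPO is typically parked on the drop DPO until its
 * real parent resolves. The drop DPOs are per-protocol singletons, so
 * drop_dpo_get() always returns a valid object and locking is a no-op.
 */
static void
default_to_drop_sketch (dpo_id_t *dpo)
{
    dpo_copy (dpo, drop_dpo_get (DPO_PROTO_IP4));

    ASSERT (dpo_is_drop (dpo));
}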
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/dpo/interface_rx_dpo.h> +#include <vnet/fib/fib_node.h> + +/* + * The 'DB' of interface DPOs. + * There is only one per-interface per-protocol, so this is a per-interface + * vector + */ +static index_t *interface_rx_dpo_db[DPO_PROTO_NUM]; + +static interface_rx_dpo_t * +interface_rx_dpo_alloc (void) +{ + interface_rx_dpo_t *ido; + + pool_get(interface_rx_dpo_pool, ido); + + return (ido); +} + +static inline interface_rx_dpo_t * +interface_rx_dpo_get_from_dpo (const dpo_id_t *dpo) +{ + ASSERT(DPO_INTERFACE_RX == dpo->dpoi_type); + + return (interface_rx_dpo_get(dpo->dpoi_index)); +} + +static inline index_t +interface_rx_dpo_get_index (interface_rx_dpo_t *ido) +{ + return (ido - interface_rx_dpo_pool); +} + +static void +interface_rx_dpo_lock (dpo_id_t *dpo) +{ + interface_rx_dpo_t *ido; + + ido = interface_rx_dpo_get_from_dpo(dpo); + ido->ido_locks++; +} + +static void +interface_rx_dpo_unlock (dpo_id_t *dpo) +{ + interface_rx_dpo_t *ido; + + ido = interface_rx_dpo_get_from_dpo(dpo); + ido->ido_locks--; + + if (0 == ido->ido_locks) + { + interface_rx_dpo_db[ido->ido_proto][ido->ido_sw_if_index] = + INDEX_INVALID; + pool_put(interface_rx_dpo_pool, ido); + } +} + +/* + * interface_rx_dpo_add_or_lock + * + * Add/create and lock a new or lock an existing for the interface DPO + * on the interface and protocol given + */ +void +interface_rx_dpo_add_or_lock (dpo_proto_t proto, + u32 sw_if_index, + dpo_id_t *dpo) +{ + interface_rx_dpo_t *ido; + + vec_validate_init_empty(interface_rx_dpo_db[proto], + sw_if_index, + INDEX_INVALID); + + if (INDEX_INVALID == interface_rx_dpo_db[proto][sw_if_index]) + { + ido = interface_rx_dpo_alloc(); + + ido->ido_sw_if_index = sw_if_index; + ido->ido_proto = proto; + + interface_rx_dpo_db[proto][sw_if_index] = + interface_rx_dpo_get_index(ido); + } + else + { + ido = interface_rx_dpo_get(interface_rx_dpo_db[proto][sw_if_index]); + } + + dpo_set(dpo, DPO_INTERFACE_RX, proto, interface_rx_dpo_get_index(ido)); +} + + +static clib_error_t * +interface_rx_dpo_interface_state_change (vnet_main_t * vnm, + u32 sw_if_index, + u32 flags) +{ + /* + */ + return (NULL); +} + +VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION( + interface_rx_dpo_interface_state_change); + +/** + * @brief Registered callback for HW interface state changes + */ +static clib_error_t * +interface_rx_dpo_hw_interface_state_change (vnet_main_t * vnm, + u32 hw_if_index, + u32 flags) +{ + return (NULL); +} + +VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION( + interface_rx_dpo_hw_interface_state_change); + +static clib_error_t * +interface_rx_dpo_interface_delete (vnet_main_t * vnm, + u32 sw_if_index, + u32 is_add) +{ + return (NULL); +} + +VNET_SW_INTERFACE_ADD_DEL_FUNCTION( + interface_rx_dpo_interface_delete); + +u8* +format_interface_rx_dpo (u8* s, va_list *ap) +{ + index_t index = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + vnet_main_t * vnm = vnet_get_main(); + interface_rx_dpo_t *ido = interface_rx_dpo_get(index); + + return (format(s, "%U-dpo: %U", + format_vnet_sw_interface_name, + vnm, + vnet_get_sw_interface(vnm, ido->ido_sw_if_index), + 
format_dpo_proto, ido->ido_proto)); +} + +static void +interface_rx_dpo_mem_show (void) +{ + fib_show_memory_usage("Interface", + pool_elts(interface_rx_dpo_pool), + pool_len(interface_rx_dpo_pool), + sizeof(interface_rx_dpo_t)); +} + + +const static dpo_vft_t interface_rx_dpo_vft = { + .dv_lock = interface_rx_dpo_lock, + .dv_unlock = interface_rx_dpo_unlock, + .dv_format = format_interface_rx_dpo, + .dv_mem_show = interface_rx_dpo_mem_show, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a glean + * object. + * + * this means that these graph nodes are ones from which a glean is the + * parent object in the DPO-graph. + */ +const static char* const interface_rx_dpo_ip4_nodes[] = +{ + "interface-rx-dpo-ip4", + NULL, +}; +const static char* const interface_rx_dpo_ip6_nodes[] = +{ + "interface-rx-dpo-ip6", + NULL, +}; +const static char* const interface_rx_dpo_l2_nodes[] = +{ + "interface-rx-dpo-l2", + NULL, +}; + +const static char* const * const interface_rx_dpo_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = interface_rx_dpo_ip4_nodes, + [DPO_PROTO_IP6] = interface_rx_dpo_ip6_nodes, + [DPO_PROTO_ETHERNET] = interface_rx_dpo_l2_nodes, + [DPO_PROTO_MPLS] = NULL, +}; + +void +interface_rx_dpo_module_init (void) +{ + dpo_register(DPO_INTERFACE_RX, + &interface_rx_dpo_vft, + interface_rx_dpo_nodes); +} + +/** + * @brief Interface DPO trace data + */ +typedef struct interface_rx_dpo_trace_t_ +{ + u32 sw_if_index; +} interface_rx_dpo_trace_t; + +typedef enum interface_rx_dpo_next_t_ +{ + INTERFACE_RX_DPO_DROP = 0, + INTERFACE_RX_DPO_INPUT = 1, +} interface_rx_dpo_next_t; + +always_inline uword +interface_rx_dpo_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + u32 n_left_from, next_index, * from, * to_next; + u32 thread_index = vlib_get_thread_index (); + vnet_interface_main_t *im; + + im = &vnet_get_main ()->interface_main; + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next > 2) + { + const interface_rx_dpo_t *ido0, *ido1; + u32 bi0, idoi0, bi1, idoi1; + vlib_buffer_t *b0, *b1; + + bi0 = from[0]; + to_next[0] = bi0; + bi1 = from[1]; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + idoi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + idoi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + ido0 = interface_rx_dpo_get(idoi0); + ido1 = interface_rx_dpo_get(idoi1); + + vnet_buffer(b0)->sw_if_index[VLIB_RX] = ido0->ido_sw_if_index; + vnet_buffer(b1)->sw_if_index[VLIB_RX] = ido1->ido_sw_if_index; + + vlib_increment_combined_counter (im->combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + thread_index, + ido0->ido_sw_if_index, + 1, + vlib_buffer_length_in_chain (vm, b0)); + vlib_increment_combined_counter (im->combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + thread_index, + ido1->ido_sw_if_index, + 1, + vlib_buffer_length_in_chain (vm, b1)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + interface_rx_dpo_trace_t *tr0; + + tr0 = vlib_add_trace (vm, node, b0, sizeof (*tr0)); + tr0->sw_if_index = ido0->ido_sw_if_index; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + interface_rx_dpo_trace_t *tr1; + + tr1 = vlib_add_trace (vm, node, b1, sizeof 
(*tr1)); + tr1->sw_if_index = ido1->ido_sw_if_index; + } + + vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, + n_left_to_next, bi0, bi1, + INTERFACE_RX_DPO_INPUT, + INTERFACE_RX_DPO_INPUT); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + const interface_rx_dpo_t * ido0; + vlib_buffer_t * b0; + u32 bi0, idoi0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + idoi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + ido0 = interface_rx_dpo_get(idoi0); + + /* Swap the RX interface of the packet to the one the + * interface DPR represents */ + vnet_buffer(b0)->sw_if_index[VLIB_RX] = ido0->ido_sw_if_index; + + /* Bump the interface's RX coutners */ + vlib_increment_combined_counter (im->combined_sw_if_counters + + VNET_INTERFACE_COUNTER_RX, + thread_index, + ido0->ido_sw_if_index, + 1, + vlib_buffer_length_in_chain (vm, b0)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + interface_rx_dpo_trace_t *tr; + + tr = vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->sw_if_index = ido0->ido_sw_if_index; + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, + INTERFACE_RX_DPO_INPUT); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static u8 * +format_interface_rx_dpo_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + interface_rx_dpo_trace_t * t = va_arg (*args, interface_rx_dpo_trace_t *); + uword indent = format_get_indent (s); + s = format (s, "%U sw_if_index:%d", + format_white_space, indent, + t->sw_if_index); + return s; +} + +static uword +interface_rx_dpo_ip4 (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (interface_rx_dpo_inline(vm, node, from_frame)); +} + +static uword +interface_rx_dpo_ip6 (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (interface_rx_dpo_inline(vm, node, from_frame)); +} + +static uword +interface_rx_dpo_l2 (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (interface_rx_dpo_inline(vm, node, from_frame)); +} + +VLIB_REGISTER_NODE (interface_rx_dpo_ip4_node) = { + .function = interface_rx_dpo_ip4, + .name = "interface-rx-dpo-ip4", + .vector_size = sizeof (u32), + .format_trace = format_interface_rx_dpo_trace, + + .n_next_nodes = 2, + .next_nodes = { + [INTERFACE_RX_DPO_DROP] = "ip4-drop", + [INTERFACE_RX_DPO_INPUT] = "ip4-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (interface_rx_dpo_ip4_node, + interface_rx_dpo_ip4) + +VLIB_REGISTER_NODE (interface_rx_dpo_ip6_node) = { + .function = interface_rx_dpo_ip6, + .name = "interface-rx-dpo-ip6", + .vector_size = sizeof (u32), + .format_trace = format_interface_rx_dpo_trace, + + .n_next_nodes = 2, + .next_nodes = { + [INTERFACE_RX_DPO_DROP] = "ip6-drop", + [INTERFACE_RX_DPO_INPUT] = "ip6-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH (interface_rx_dpo_ip6_node, + interface_rx_dpo_ip6) + +VLIB_REGISTER_NODE (interface_rx_dpo_l2_node) = { + .function = interface_rx_dpo_l2, + .name = "interface-rx-dpo-l2", + .vector_size = sizeof (u32), + .format_trace = format_interface_rx_dpo_trace, + + .n_next_nodes = 2, + .next_nodes = { + [INTERFACE_RX_DPO_DROP] = "error-drop", + [INTERFACE_RX_DPO_INPUT] = "l2-input", + }, +}; + +VLIB_NODE_FUNCTION_MULTIARCH 
(interface_rx_dpo_l2_node, + interface_rx_dpo_l2) diff --git a/src/vnet/dpo/interface_rx_dpo.h b/src/vnet/dpo/interface_rx_dpo.h new file mode 100644 index 00000000..edecce08 --- /dev/null +++ b/src/vnet/dpo/interface_rx_dpo.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __INTERFACE_RX_DPO_H__ +#define __INTERFACE_RX_DPO_H__ + +#include <vnet/dpo/dpo.h> + +/** + * @brief + * The data-path object representing a change of receive interface. + * If a packet encounters an object of this type in the data-path, it's + * RX interface is changed. + */ +typedef struct interface_rx_dpo_t_ +{ + /** + * The Software interface index that the packets will be given + * as the ingress/rx interface + */ + u32 ido_sw_if_index; + + /** + * next VLIB node. A '<proto>-input' node. + */ + u32 ido_next_node; + + /** + * DPO protocol that the packets will have as they 'ingress' + * on this interface + */ + dpo_proto_t ido_proto; + + /** + * number of locks. + */ + u16 ido_locks; +} interface_rx_dpo_t; + +extern void interface_rx_dpo_add_or_lock (dpo_proto_t proto, + u32 sw_if_index, + dpo_id_t *dpo); + +extern void interface_rx_dpo_module_init(void); + +/** + * @brief pool of all interface DPOs + */ +interface_rx_dpo_t *interface_rx_dpo_pool; + +static inline interface_rx_dpo_t * +interface_rx_dpo_get (index_t index) +{ + return (pool_elt_at_index(interface_rx_dpo_pool, index)); +} + +#endif diff --git a/src/vnet/dpo/interface_tx_dpo.c b/src/vnet/dpo/interface_tx_dpo.c new file mode 100644 index 00000000..f7c8bfda --- /dev/null +++ b/src/vnet/dpo/interface_tx_dpo.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
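/*
 * Sketch using interface_rx_dpo_add_or_lock() from the header above:
 * redirect traffic so it re-enters the graph as if received on another
 * interface. There is one shared interface-rx DPO per (protocol,
 * sw_if_index); add_or_lock creates it on first use and takes a
 * reference, which dpo_reset() later releases.
 */
static void
rx_redirect_sketch (u32 sw_if_index, dpo_id_t *dpo)
{
    interface_rx_dpo_add_or_lock (DPO_PROTO_IP4, sw_if_index, dpo);

    /* ... stack a child on 'dpo' while the redirect is in place ... */

    dpo_reset (dpo);   /* drop the reference when the route is removed */
}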
+ */ + +#include <vnet/dpo/interface_tx_dpo.h> +#include <vnet/adj/rewrite.h> + +/* + * We do not lock nor unlock these DPOs since there is nothing to lock + * all we do is construct DPO object wrappers around a sw_if_index + */ +static void +interface_tx_dpo_lock (dpo_id_t *dpo) +{ +} + +static void +interface_tx_dpo_unlock (dpo_id_t *dpo) +{ +} + +/* + * interface_tx_dpo_add_or_lock + * + * construct DPO object wrappers around a sw_if_index + */ +void +interface_tx_dpo_add_or_lock (dpo_proto_t proto, + u32 sw_if_index, + dpo_id_t *dpo) +{ + dpo_set(dpo, DPO_INTERFACE_TX, proto, sw_if_index); +} + +u8* +format_interface_tx_dpo (u8* s, va_list *ap) +{ + index_t index = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + vnet_main_t * vnm = vnet_get_main(); + + return (format(s, "%U-dpo:", + format_vnet_sw_interface_name, + vnm, + vnet_get_sw_interface(vnm, index))); +} + +static void +interface_tx_dpo_mem_show (void) +{ +} + +u32* +interface_tx_dpo_get_next_node (const dpo_id_t *dpo) +{ + u32 *node_indices = NULL; + + /* + * return the interface's TX node for the wrapped sw_if_index + */ + vec_add1(node_indices, + vnet_tx_node_index_for_sw_interface(vnet_get_main(), + dpo->dpoi_index)); + + return (node_indices); +} + +const static dpo_vft_t interface_tx_dpo_vft = { + .dv_lock = interface_tx_dpo_lock, + .dv_unlock = interface_tx_dpo_unlock, + .dv_format = format_interface_tx_dpo, + .dv_mem_show = interface_tx_dpo_mem_show, + .dv_get_next_node = interface_tx_dpo_get_next_node, +}; + +void +interface_tx_dpo_module_init (void) +{ + dpo_register(DPO_INTERFACE_TX, &interface_tx_dpo_vft, NULL); +} + diff --git a/src/vnet/dpo/interface_tx_dpo.h b/src/vnet/dpo/interface_tx_dpo.h new file mode 100644 index 00000000..0c560ada --- /dev/null +++ b/src/vnet/dpo/interface_tx_dpo.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * The data-path object representing transmitting the packet on a n interface. + * This is a convenient DPO wrapper around a simple interface transmit and thus + * allows us to represent direct interface transmit in the DPO model. + */ + +#ifndef __INTERFACE_TX_DPO_H__ +#define __INTERFACE_TX_DPO_H__ + +#include <vnet/dpo/dpo.h> + +extern void interface_tx_dpo_add_or_lock (dpo_proto_t proto, + u32 sw_if_index, + dpo_id_t *dpo); + +extern void interface_tx_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/ip_null_dpo.c b/src/vnet/dpo/ip_null_dpo.c new file mode 100644 index 00000000..22682e4e --- /dev/null +++ b/src/vnet/dpo/ip_null_dpo.c @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
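/*
 * Sketch: wrapping an interface's TX node as a DPO. Unlike the RX
 * variant there is no pool and no locking, because dpoi_index stores
 * the sw_if_index directly; dv_get_next_node later resolves it to the
 * interface's own TX node when the graph arcs are built.
 */
static void
tx_wrap_sketch (u32 sw_if_index, dpo_id_t *dpo)
{
    interface_tx_dpo_add_or_lock (DPO_PROTO_IP4, sw_if_index, dpo);
}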
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * The data-path object representing dropping the packet + */ + +#include <vnet/dpo/ip_null_dpo.h> +#include <vnet/ip/ip.h> + +/** + * @brief A representation of the IP_NULL DPO + */ +typedef struct ip_null_dpo_t_ +{ + /** + * @brief The action to take on a packet + */ + ip_null_dpo_action_t ind_action; + /** + * @brief The next VLIB node + */ + u32 ind_next_index; + /** + * rate limits + */ +} ip_null_dpo_t; + +/** + * @brief the IP_NULL dpos are shared by all routes, hence they are global. + * As the neame implies this is only for IP, hence 2. + */ +static ip_null_dpo_t ip_null_dpos[2 * IP_NULL_DPO_ACTION_NUM] = { + [0] = { + /* proto ip4, no action */ + .ind_action = IP_NULL_ACTION_NONE, + }, + [1] = { + /* proto ip4, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_UNREACH, + }, + [2] = { + /* proto ip4, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_PROHIBIT, + }, + [3] = { + /* proto ip6, no action */ + .ind_action = IP_NULL_ACTION_NONE, + }, + [4] = { + /* proto ip6, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_UNREACH, + }, + [5] = { + /* proto ip6, action send unreach */ + .ind_action = IP_NULL_ACTION_SEND_ICMP_PROHIBIT, + }, +}; + +/** + * @brief Action strings + */ +const char *ip_null_action_strings[] = IP_NULL_ACTIONS; + +void +ip_null_dpo_add_and_lock (dpo_proto_t proto, + ip_null_dpo_action_t action, + dpo_id_t *dpo) +{ + int i; + + ASSERT((proto == DPO_PROTO_IP4) || + (proto == DPO_PROTO_IP6)); + ASSERT(action < IP_NULL_DPO_ACTION_NUM); + + i = (proto == DPO_PROTO_IP4 ? 0 : 1); + + dpo_set(dpo, DPO_IP_NULL, proto, (i*IP_NULL_DPO_ACTION_NUM) + action); +} + +always_inline const ip_null_dpo_t* +ip_null_dpo_get (index_t indi) +{ + return (&ip_null_dpos[indi]); +} + +static void +ip_null_dpo_lock (dpo_id_t *dpo) +{ + /* + * not maintaining a lock count on the ip_null, they are const global and + * never die. + */ +} +static void +ip_null_dpo_unlock (dpo_id_t *dpo) +{ +} + +static u8* +format_ip_null_dpo (u8 *s, va_list *ap) +{ + index_t index = va_arg(*ap, index_t); + CLIB_UNUSED(u32 indent) = va_arg(*ap, u32); + const ip_null_dpo_t *ind; + dpo_proto_t proto; + + ind = ip_null_dpo_get(index); + proto = (index < IP_NULL_DPO_ACTION_NUM ? DPO_PROTO_IP4 : DPO_PROTO_IP6); + + return (format(s, "%U-null action:%s", + format_dpo_proto, proto, + ip_null_action_strings[ind->ind_action])); +} + +const static dpo_vft_t ip_null_vft = { + .dv_lock = ip_null_dpo_lock, + .dv_unlock = ip_null_dpo_unlock, + .dv_format = format_ip_null_dpo, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a ip_null + * object. + * + * this means that these graph nodes are ones from which a ip_null is the + * parent object in the DPO-graph. 
+ */ +const static char* const ip4_null_nodes[] = +{ + "ip4-null", + NULL, +}; +const static char* const ip6_null_nodes[] = +{ + "ip6-null", + NULL, +}; + +const static char* const * const ip_null_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = ip4_null_nodes, + [DPO_PROTO_IP6] = ip6_null_nodes, +}; + +typedef struct ip_null_dpo_trace_t_ +{ + index_t ind_index; +} ip_null_dpo_trace_t; + +/** + * @brief Exit nodes from a IP_NULL + */ +typedef enum ip_null_next_t_ +{ + IP_NULL_NEXT_DROP, + IP_NULL_NEXT_ICMP, + IP_NULL_NEXT_NUM, +} ip_null_next_t; + +always_inline uword +ip_null_dpo_switch (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + u8 is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + static f64 time_last_seed_change = -1e100; + static u32 hash_seeds[3]; + static uword hash_bitmap[256 / BITS (uword)]; + f64 time_now; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + time_now = vlib_time_now (vm); + if (time_now - time_last_seed_change > 1e-1) + { + uword i; + u32 * r = clib_random_buffer_get_data (&vm->random_buffer, + sizeof (hash_seeds)); + for (i = 0; i < ARRAY_LEN (hash_seeds); i++) + hash_seeds[i] = r[i]; + + /* Mark all hash keys as been not-seen before. */ + for (i = 0; i < ARRAY_LEN (hash_bitmap); i++) + hash_bitmap[i] = 0; + + time_last_seed_change = time_now; + } + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 a0, b0, c0, m0, drop0; + vlib_buffer_t *p0; + u32 bi0, indi0, next0; + const ip_null_dpo_t *ind0; + uword bm0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + + /* lookup dst + src mac */ + indi0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + ind0 = ip_null_dpo_get(indi0); + next0 = IP_NULL_NEXT_DROP; + + /* + * rate limit - don't DoS the sender. + */ + a0 = hash_seeds[0]; + b0 = hash_seeds[1]; + c0 = hash_seeds[2]; + + if (is_ip4) + { + ip4_header_t *ip0 = vlib_buffer_get_current (p0); + + a0 ^= ip0->dst_address.data_u32; + b0 ^= ip0->src_address.data_u32; + + hash_v3_finalize32 (a0, b0, c0); + } + else + { + ip6_header_t *ip0 = vlib_buffer_get_current (p0); + + a0 ^= ip0->dst_address.as_u32[0]; + b0 ^= ip0->src_address.as_u32[0]; + c0 ^= ip0->src_address.as_u32[1]; + + hash_v3_mix32 (a0, b0, c0); + + a0 ^= ip0->dst_address.as_u32[1]; + b0 ^= ip0->src_address.as_u32[2]; + c0 ^= ip0->src_address.as_u32[3]; + + hash_v3_finalize32 (a0, b0, c0); + } + + c0 &= BITS (hash_bitmap) - 1; + c0 = c0 / BITS (uword); + m0 = (uword) 1 << (c0 % BITS (uword)); + + bm0 = hash_bitmap[c0]; + drop0 = (bm0 & m0) != 0; + + /* Mark it as seen. */ + hash_bitmap[c0] = bm0 | m0; + + if (PREDICT_FALSE(!drop0)) + { + if (is_ip4) + { + /* + * There's a trade-off here. This conditinal statement + * versus a graph node per-condition. 
Given the number + * expect number of packets to reach a null route is 0 + * we favour the run-time cost over the graph complexity + */ + if (IP_NULL_ACTION_SEND_ICMP_UNREACH == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp4_error_set_vnet_buffer( + p0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_destination_unreachable_host, + 0); + } + else if (IP_NULL_ACTION_SEND_ICMP_PROHIBIT == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp4_error_set_vnet_buffer( + p0, + ICMP4_destination_unreachable, + ICMP4_destination_unreachable_host_administratively_prohibited, + 0); + } + } + else + { + if (IP_NULL_ACTION_SEND_ICMP_UNREACH == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp6_error_set_vnet_buffer( + p0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_no_route_to_destination, + 0); + } + else if (IP_NULL_ACTION_SEND_ICMP_PROHIBIT == ind0->ind_action) + { + next0 = IP_NULL_NEXT_ICMP; + icmp6_error_set_vnet_buffer( + p0, + ICMP6_destination_unreachable, + ICMP6_destination_unreachable_destination_administratively_prohibited, + 0); + } + } + } + + if (PREDICT_FALSE (p0->flags & VLIB_BUFFER_IS_TRACED)) + { + ip_null_dpo_trace_t *tr = vlib_add_trace (vm, node, p0, + sizeof (*tr)); + tr->ind_index = indi0; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static u8 * +format_ip_null_dpo_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip_null_dpo_trace_t *t = va_arg (*args, ip_null_dpo_trace_t *); + + s = format (s, "%U", format_ip_null_dpo, t->ind_index, 0); + return s; +} + +static uword +ip4_null_dpo_switch (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (ip_null_dpo_switch(vm, node, frame, 1)); +} + +/** + * @brief + */ +VLIB_REGISTER_NODE (ip4_null_dpo_node) = { + .function = ip4_null_dpo_switch, + .name = "ip4-null", + .vector_size = sizeof (u32), + + .format_trace = format_ip_null_dpo_trace, + .n_next_nodes = IP_NULL_NEXT_NUM, + .next_nodes = { + [IP_NULL_NEXT_DROP] = "ip4-drop", + [IP_NULL_NEXT_ICMP] = "ip4-icmp-error", + }, +}; + +static uword +ip6_null_dpo_switch (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (ip_null_dpo_switch(vm, node, frame, 0)); +} + +/** + * @brief + */ +VLIB_REGISTER_NODE (ip6_null_dpo_node) = { + .function = ip6_null_dpo_switch, + .name = "ip6-null", + .vector_size = sizeof (u32), + + .format_trace = format_ip_null_dpo_trace, + .n_next_nodes = IP_NULL_NEXT_NUM, + .next_nodes = { + [IP_NULL_NEXT_DROP] = "ip6-drop", + [IP_NULL_NEXT_ICMP] = "ip6-icmp-error", + }, +}; + +void +ip_null_dpo_module_init (void) +{ + dpo_register(DPO_IP_NULL, &ip_null_vft, ip_null_nodes); +} diff --git a/src/vnet/dpo/ip_null_dpo.h b/src/vnet/dpo/ip_null_dpo.h new file mode 100644 index 00000000..002a2a70 --- /dev/null +++ b/src/vnet/dpo/ip_null_dpo.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
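/*
 * The ICMP rate-limit in ip_null_dpo_switch() above is, in essence, a
 * one-bit Bloom filter: a 256-bit 'seen' bitmap keyed by a hash of the
 * packet's addresses, cleared and re-seeded every 100ms, so each flow
 * triggers at most roughly one reply per interval. A distilled sketch
 * (the slot-to-bit arithmetic here is a simplified variant):
 */
static uword seen_bitmap[256 / BITS (uword)];

static int
should_reply_sketch (u32 hash)
{
    u32 slot = hash & (256 - 1);             /* bitmap slot */
    u32 word = slot / BITS (uword);          /* word holding the slot */
    uword mask = (uword) 1 << (slot % BITS (uword));
    int first = (seen_bitmap[word] & mask) == 0;

    seen_bitmap[word] |= mask;               /* mark the flow as seen */
    return (first);
}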
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + * The IP NULL DPO represents the rubbish bin for IP traffic. Without specifying an + * action (i.e. send IMCP type X to sender) it is equivalent to using a drop DPO. + * However, in contrast to the drop DPO any route that resovles via a NULL, is + * considered to 'resolved' by FIB, i.e. a IP NULL is used when the control plane + * is explicitly expressing the desire to drop packets. Drop DPOs are used + * internally by FIB when resolution is not possible. + * + * Any replies to sender are rate limited. + */ + +#ifndef __IP_NULL_DPO_H__ +#define __IP_NULL_DPO_H__ + +#include <vnet/dpo/dpo.h> + +/** + * @brief Actions to take when a packet encounters the NULL DPO + */ +typedef enum ip_null_dpo_action_t_ +{ + IP_NULL_ACTION_NONE, + IP_NULL_ACTION_SEND_ICMP_UNREACH, + IP_NULL_ACTION_SEND_ICMP_PROHIBIT, +} ip_null_dpo_action_t; + +#define IP_NULL_ACTIONS { \ + [IP_NULL_ACTION_NONE] = "discard", \ + [IP_NULL_ACTION_SEND_ICMP_UNREACH] = "send-unreachable", \ + [IP_NULL_ACTION_SEND_ICMP_PROHIBIT] = "send-prohibited", \ +} + +#define IP_NULL_DPO_ACTION_NUM (IP_NULL_ACTION_SEND_ICMP_PROHIBIT+1) + +extern void ip_null_dpo_add_and_lock (dpo_proto_t proto, + ip_null_dpo_action_t action, + dpo_id_t *dpo); + +extern void ip_null_dpo_module_init(void); + +#endif diff --git a/src/vnet/dpo/load_balance.c b/src/vnet/dpo/load_balance.c new file mode 100644 index 00000000..af054f1c --- /dev/null +++ b/src/vnet/dpo/load_balance.c @@ -0,0 +1,1115 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/ip/lookup.h> +#include <vnet/dpo/load_balance.h> +#include <vnet/dpo/load_balance_map.h> +#include <vnet/dpo/drop_dpo.h> +#include <vppinfra/math.h> /* for fabs */ +#include <vnet/adj/adj.h> +#include <vnet/adj/adj_internal.h> +#include <vnet/fib/fib_urpf_list.h> + +/* + * distribution error tolerance for load-balancing + */ +const f64 multipath_next_hop_error_tolerance = 0.1; + +#undef LB_DEBUG + +#ifdef LB_DEBUG +#define LB_DBG(_lb, _fmt, _args...) \ +{ \ + u8* _tmp =NULL; \ + clib_warning("lb:[%s]:" _fmt, \ + load_balance_format(load_balance_get_index((_lb)), \ + 0, _tmp), \ + ##_args); \ + vec_free(_tmp); \ +} +#else +#define LB_DBG(_p, _fmt, _args...) +#endif + + +/** + * Pool of all DPOs. 
It's not static so the DP can have fast access + */ +load_balance_t *load_balance_pool; + +/** + * The one instance of load-balance main + */ +load_balance_main_t load_balance_main; + +f64 +load_balance_get_multipath_tolerance (void) +{ + return (multipath_next_hop_error_tolerance); +} + +static inline index_t +load_balance_get_index (const load_balance_t *lb) +{ + return (lb - load_balance_pool); +} + +static inline dpo_id_t* +load_balance_get_buckets (load_balance_t *lb) +{ + if (LB_HAS_INLINE_BUCKETS(lb)) + { + return (lb->lb_buckets_inline); + } + else + { + return (lb->lb_buckets); + } +} + +static load_balance_t * +load_balance_alloc_i (void) +{ + load_balance_t *lb; + + pool_get_aligned(load_balance_pool, lb, CLIB_CACHE_LINE_BYTES); + memset(lb, 0, sizeof(*lb)); + + lb->lb_map = INDEX_INVALID; + lb->lb_urpf = INDEX_INVALID; + vlib_validate_combined_counter(&(load_balance_main.lbm_to_counters), + load_balance_get_index(lb)); + vlib_validate_combined_counter(&(load_balance_main.lbm_via_counters), + load_balance_get_index(lb)); + vlib_zero_combined_counter(&(load_balance_main.lbm_to_counters), + load_balance_get_index(lb)); + vlib_zero_combined_counter(&(load_balance_main.lbm_via_counters), + load_balance_get_index(lb)); + + return (lb); +} + +static u8* +load_balance_format (index_t lbi, + load_balance_format_flags_t flags, + u32 indent, + u8 *s) +{ + vlib_counter_t to, via; + load_balance_t *lb; + dpo_id_t *buckets; + u32 i; + + lb = load_balance_get(lbi); + vlib_get_combined_counter(&(load_balance_main.lbm_to_counters), lbi, &to); + vlib_get_combined_counter(&(load_balance_main.lbm_via_counters), lbi, &via); + buckets = load_balance_get_buckets(lb); + + s = format(s, "%U: ", format_dpo_type, DPO_LOAD_BALANCE); + s = format(s, "[proto:%U ", format_dpo_proto, lb->lb_proto); + s = format(s, "index:%d buckets:%d ", lbi, lb->lb_n_buckets); + s = format(s, "uRPF:%d ", lb->lb_urpf); + s = format(s, "to:[%Ld:%Ld]", to.packets, to.bytes); + if (0 != via.packets) + { + s = format(s, " via:[%Ld:%Ld]", + via.packets, via.bytes); + } + s = format(s, "]"); + + if (INDEX_INVALID != lb->lb_map) + { + s = format(s, "\n%U%U", + format_white_space, indent+4, + format_load_balance_map, lb->lb_map, indent+4); + } + for (i = 0; i < lb->lb_n_buckets; i++) + { + s = format(s, "\n%U[%d] %U", + format_white_space, indent+2, + i, + format_dpo_id, + &buckets[i], indent+6); + } + return (s); +} + +u8* +format_load_balance (u8 * s, va_list * args) +{ + index_t lbi = va_arg(*args, index_t); + load_balance_format_flags_t flags = va_arg(*args, load_balance_format_flags_t); + + return (load_balance_format(lbi, flags, 0, s)); +} +static u8* +format_load_balance_dpo (u8 * s, va_list * args) +{ + index_t lbi = va_arg(*args, index_t); + u32 indent = va_arg(*args, u32); + + return (load_balance_format(lbi, LOAD_BALANCE_FORMAT_DETAIL, indent, s)); +} + + +static load_balance_t * +load_balance_create_i (u32 num_buckets, + dpo_proto_t lb_proto, + flow_hash_config_t fhc) +{ + load_balance_t *lb; + + lb = load_balance_alloc_i(); + lb->lb_hash_config = fhc; + lb->lb_n_buckets = num_buckets; + lb->lb_n_buckets_minus_1 = num_buckets-1; + lb->lb_proto = lb_proto; + + if (!LB_HAS_INLINE_BUCKETS(lb)) + { + vec_validate_aligned(lb->lb_buckets, + lb->lb_n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + } + + LB_DBG(lb, "create"); + + return (lb); +} + +index_t +load_balance_create (u32 n_buckets, + dpo_proto_t lb_proto, + flow_hash_config_t fhc) +{ + return (load_balance_get_index(load_balance_create_i(n_buckets, lb_proto, fhc))); +} + 
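+
+/*
+ * Editor's note: a minimal usage sketch (not part of the original patch)
+ * showing how a client builds a load-balance with the create/set-bucket
+ * API above. The function name is hypothetical, and drop DPOs stand in
+ * for real paths, just as load_balance_module_init() below does for the
+ * special index-zero LB.
+ */
+#if 0 /* illustrative only */
+static index_t
+example_build_two_bucket_lb (void)
+{
+    index_t lbi;
+
+    /* two buckets, IPv4 payloads, flow-hash config 0 (the default) */
+    lbi = load_balance_create(2, DPO_PROTO_IP4, 0);
+
+    /* stack each bucket on a child DPO; drops are used here for brevity */
+    load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4));
+    load_balance_set_bucket(lbi, 1, drop_dpo_get(DPO_PROTO_IP4));
+
+    return (lbi);
+}
+#endif
+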
+static inline void +load_balance_set_bucket_i (load_balance_t *lb, + u32 bucket, + dpo_id_t *buckets, + const dpo_id_t *next) +{ + dpo_stack(DPO_LOAD_BALANCE, lb->lb_proto, &buckets[bucket], next); +} + +void +load_balance_set_bucket (index_t lbi, + u32 bucket, + const dpo_id_t *next) +{ + load_balance_t *lb; + dpo_id_t *buckets; + + lb = load_balance_get(lbi); + buckets = load_balance_get_buckets(lb); + + ASSERT(bucket < lb->lb_n_buckets); + + load_balance_set_bucket_i(lb, bucket, buckets, next); +} + +int +load_balance_is_drop (const dpo_id_t *dpo) +{ + load_balance_t *lb; + + if (DPO_LOAD_BALANCE != dpo->dpoi_type) + return (0); + + lb = load_balance_get(dpo->dpoi_index); + + if (1 == lb->lb_n_buckets) + { + return (dpo_is_drop(load_balance_get_bucket_i(lb, 0))); + } + return (0); +} + +void +load_balance_set_fib_entry_flags (index_t lbi, + fib_entry_flag_t flags) +{ + load_balance_t *lb; + + lb = load_balance_get(lbi); + lb->lb_fib_entry_flags = flags; +} + + +void +load_balance_set_urpf (index_t lbi, + index_t urpf) +{ + load_balance_t *lb; + index_t old; + + lb = load_balance_get(lbi); + + /* + * packets in flight we see this change. but it's atomic, so :P + */ + old = lb->lb_urpf; + lb->lb_urpf = urpf; + + fib_urpf_list_unlock(old); + fib_urpf_list_lock(urpf); +} + +index_t +load_balance_get_urpf (index_t lbi) +{ + load_balance_t *lb; + + lb = load_balance_get(lbi); + + return (lb->lb_urpf); +} + +const dpo_id_t * +load_balance_get_bucket (index_t lbi, + u32 bucket) +{ + load_balance_t *lb; + + lb = load_balance_get(lbi); + + return (load_balance_get_bucket_i(lb, bucket)); +} + +static int +next_hop_sort_by_weight (const load_balance_path_t * n1, + const load_balance_path_t * n2) +{ + return ((int) n1->path_weight - (int) n2->path_weight); +} + +/* Given next hop vector is over-written with normalized one with sorted weights and + with weights corresponding to the number of adjacencies for each next hop. + Returns number of adjacencies in block. */ +u32 +ip_multipath_normalize_next_hops (const load_balance_path_t * raw_next_hops, + load_balance_path_t ** normalized_next_hops, + u32 *sum_weight_in, + f64 multipath_next_hop_error_tolerance) +{ + load_balance_path_t * nhs; + uword n_nhs, n_adj, n_adj_left, i, sum_weight; + f64 norm, error; + + n_nhs = vec_len (raw_next_hops); + ASSERT (n_nhs > 0); + if (n_nhs == 0) + return 0; + + /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */ + nhs = *normalized_next_hops; + vec_validate (nhs, 2*n_nhs - 1); + + /* Fast path: 1 next hop in block. */ + n_adj = n_nhs; + if (n_nhs == 1) + { + nhs[0] = raw_next_hops[0]; + nhs[0].path_weight = 1; + _vec_len (nhs) = 1; + sum_weight = 1; + goto done; + } + + else if (n_nhs == 2) + { + int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0; + + /* Fast sort. */ + nhs[0] = raw_next_hops[cmp]; + nhs[1] = raw_next_hops[cmp ^ 1]; + + /* Fast path: equal cost multipath with 2 next hops. */ + if (nhs[0].path_weight == nhs[1].path_weight) + { + nhs[0].path_weight = nhs[1].path_weight = 1; + _vec_len (nhs) = 2; + sum_weight = 2; + goto done; + } + } + else + { + clib_memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0])); + qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight); + } + + /* Find total weight to normalize weights. */ + sum_weight = 0; + for (i = 0; i < n_nhs; i++) + sum_weight += nhs[i].path_weight; + + /* In the unlikely case that all weights are given as 0, set them all to 1. 
*/
+  if (sum_weight == 0)
+    {
+      for (i = 0; i < n_nhs; i++)
+        nhs[i].path_weight = 1;
+      sum_weight = n_nhs;
+    }
+
+  /* Save copies of all next hop weights to avoid being overwritten in loop below. */
+  for (i = 0; i < n_nhs; i++)
+    nhs[n_nhs + i].path_weight = nhs[i].path_weight;
+
+  /* Try larger and larger power of 2 sized adjacency blocks until we
+     find one where traffic flows to within the configured error
+     tolerance of the specified weights. */
+  for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2)
+    {
+      error = 0;
+
+      norm = n_adj / ((f64) sum_weight);
+      n_adj_left = n_adj;
+      for (i = 0; i < n_nhs; i++)
+        {
+          f64 nf = nhs[n_nhs + i].path_weight * norm; /* use saved weights */
+          word n = flt_round_nearest (nf);
+
+          n = n > n_adj_left ? n_adj_left : n;
+          n_adj_left -= n;
+          error += fabs (nf - n);
+          nhs[i].path_weight = n;
+
+          if (0 == nhs[i].path_weight)
+            {
+              /*
+               * This case arises when the weight skew is high (norm is
+               * small) and n rounds down to zero. Without this correction
+               * a path with a low weight would have no representation in
+               * the load-balance - we don't want that. So force the error
+               * high, so that a larger bucket count is tried and the
+               * low-weight path gets at least one bucket.
+               */
+              error = n_adj;
+              break;
+            }
+        }
+
+      nhs[0].path_weight += n_adj_left;
+
+      /* Is the average error per adjacency within the configured tolerance
+         for this size of adjacency block? */
+      if (error <= multipath_next_hop_error_tolerance*n_adj)
+        {
+          /* Truncate any next hops with zero weight. */
+          _vec_len (nhs) = i;
+          break;
+        }
+    }
+
+done:
+  /* Save vector for next call. */
+  *normalized_next_hops = nhs;
+  *sum_weight_in = sum_weight;
+  return n_adj;
+}
+
+static load_balance_path_t *
+load_balance_multipath_next_hop_fixup (const load_balance_path_t *nhs,
+                                       dpo_proto_t drop_proto)
+{
+    if (0 == vec_len(nhs))
+    {
+        load_balance_path_t *new_nhs = NULL, *nh;
+
+        /*
+         * We need something for the load-balance, so use the drop.
+         */
+        vec_add2(new_nhs, nh, 1);
+
+        nh->path_weight = 1;
+        dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto));
+
+        return (new_nhs);
+    }
+
+    return (NULL);
+}
+
+/*
+ * Fill in adjacencies in block based on corresponding
+ * next hop adjacencies.
+ */
+static void
+load_balance_fill_buckets (load_balance_t *lb,
+                           load_balance_path_t *nhs,
+                           dpo_id_t *buckets,
+                           u32 n_buckets)
+{
+    load_balance_path_t * nh;
+    u16 ii, bucket;
+
+    bucket = 0;
+
+    /*
+     * The next-hops have normalised weights, so their sum is the number
+     * of buckets we need to fill.
+     */
+    vec_foreach (nh, nhs)
+    {
+        for (ii = 0; ii < nh->path_weight; ii++)
+        {
+            ASSERT(bucket < n_buckets);
+            load_balance_set_bucket_i(lb, bucket++, buckets, &nh->path_dpo);
+        }
+    }
+}
+
+static inline void
+load_balance_set_n_buckets (load_balance_t *lb,
+                            u32 n_buckets)
+{
+    lb->lb_n_buckets = n_buckets;
+    lb->lb_n_buckets_minus_1 = n_buckets-1;
+}
+
+void
+load_balance_multipath_update (const dpo_id_t *dpo,
+                               const load_balance_path_t * raw_nhs,
+                               load_balance_flags_t flags)
+{
+    load_balance_path_t *nh, *nhs, *fixed_nhs;
+    u32 sum_of_weights, n_buckets, ii;
+    index_t lbmi, old_lbmi;
+    load_balance_t *lb;
+    dpo_id_t *tmp_dpo;
+
+    nhs = NULL;
+
+    ASSERT(DPO_LOAD_BALANCE == dpo->dpoi_type);
+    lb = load_balance_get(dpo->dpoi_index);
+    fixed_nhs = load_balance_multipath_next_hop_fixup(raw_nhs, lb->lb_proto);
+    n_buckets =
+        ip_multipath_normalize_next_hops((NULL == fixed_nhs ?
+                                          raw_nhs :
+                                          fixed_nhs),
+                                         &nhs,
+                                         &sum_of_weights,
+                                         multipath_next_hop_error_tolerance);
+
+    ASSERT (n_buckets >= vec_len (raw_nhs));
+
+    /*
+     * Save the old load-balance map used, and get a new one if required.
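+     * (The old map is unlocked only at the very end of this function, once
+     * no bucket can still be referring to it - editor's note.)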
+ */ + old_lbmi = lb->lb_map; + if (flags & LOAD_BALANCE_FLAG_USES_MAP) + { + lbmi = load_balance_map_add_or_lock(n_buckets, sum_of_weights, nhs); + } + else + { + lbmi = INDEX_INVALID; + } + + if (0 == lb->lb_n_buckets) + { + /* + * first time initialisation. no packets inflight, so we can write + * at leisure. + */ + load_balance_set_n_buckets(lb, n_buckets); + + if (!LB_HAS_INLINE_BUCKETS(lb)) + vec_validate_aligned(lb->lb_buckets, + lb->lb_n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + + load_balance_fill_buckets(lb, nhs, + load_balance_get_buckets(lb), + n_buckets); + lb->lb_map = lbmi; + } + else + { + /* + * This is a modification of an existing load-balance. + * We need to ensure that packets inflight see a consistent state, that + * is the number of reported buckets the LB has (read from + * lb_n_buckets_minus_1) is not more than it actually has. So if the + * number of buckets is increasing, we must update the bucket array first, + * then the reported number. vice-versa if the number of buckets goes down. + */ + if (n_buckets == lb->lb_n_buckets) + { + /* + * no change in the number of buckets. we can simply fill what + * is new over what is old. + */ + load_balance_fill_buckets(lb, nhs, + load_balance_get_buckets(lb), + n_buckets); + lb->lb_map = lbmi; + } + else if (n_buckets > lb->lb_n_buckets) + { + /* + * we have more buckets. the old load-balance map (if there is one) + * will remain valid, i.e. mapping to indices within range, so we + * update it last. + */ + if (n_buckets > LB_NUM_INLINE_BUCKETS && + lb->lb_n_buckets <= LB_NUM_INLINE_BUCKETS) + { + /* + * the new increased number of buckets is crossing the threshold + * from the inline storage to out-line. Alloc the outline buckets + * first, then fixup the number. then reset the inlines. + */ + ASSERT(NULL == lb->lb_buckets); + vec_validate_aligned(lb->lb_buckets, + n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + + load_balance_fill_buckets(lb, nhs, + lb->lb_buckets, + n_buckets); + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + + CLIB_MEMORY_BARRIER(); + + for (ii = 0; ii < LB_NUM_INLINE_BUCKETS; ii++) + { + dpo_reset(&lb->lb_buckets_inline[ii]); + } + } + else + { + if (n_buckets <= LB_NUM_INLINE_BUCKETS) + { + /* + * we are not crossing the threshold and it's still inline buckets. + * we can write the new on the old.. + */ + load_balance_fill_buckets(lb, nhs, + load_balance_get_buckets(lb), + n_buckets); + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + } + else + { + /* + * we are not crossing the threshold. We need a new bucket array to + * hold the increased number of choices. + */ + dpo_id_t *new_buckets, *old_buckets, *tmp_dpo; + + new_buckets = NULL; + old_buckets = load_balance_get_buckets(lb); + + vec_validate_aligned(new_buckets, + n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + + load_balance_fill_buckets(lb, nhs, new_buckets, n_buckets); + CLIB_MEMORY_BARRIER(); + lb->lb_buckets = new_buckets; + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + + vec_foreach(tmp_dpo, old_buckets) + { + dpo_reset(tmp_dpo); + } + vec_free(old_buckets); + } + } + + /* + * buckets fixed. ready for the MAP update. + */ + lb->lb_map = lbmi; + } + else + { + /* + * bucket size shrinkage. + * Any map we have will be based on the old + * larger number of buckets, so will be translating to indices + * out of range. So the new MAP must be installed first. 
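+             * The CLIB_MEMORY_BARRIER() that follows the map assignment
+             * below enforces exactly this ordering (editor's note).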
+ */ + lb->lb_map = lbmi; + CLIB_MEMORY_BARRIER(); + + + if (n_buckets <= LB_NUM_INLINE_BUCKETS && + lb->lb_n_buckets > LB_NUM_INLINE_BUCKETS) + { + /* + * the new decreased number of buckets is crossing the threshold + * from out-line storage to inline: + * 1 - Fill the inline buckets, + * 2 - fixup the number (and this point the inline buckets are + * used). + * 3 - free the outline buckets + */ + load_balance_fill_buckets(lb, nhs, + lb->lb_buckets_inline, + n_buckets); + CLIB_MEMORY_BARRIER(); + load_balance_set_n_buckets(lb, n_buckets); + CLIB_MEMORY_BARRIER(); + + vec_foreach(tmp_dpo, lb->lb_buckets) + { + dpo_reset(tmp_dpo); + } + vec_free(lb->lb_buckets); + } + else + { + /* + * not crossing the threshold. + * 1 - update the number to the smaller size + * 2 - write the new buckets + * 3 - reset those no longer used. + */ + dpo_id_t *buckets; + u32 old_n_buckets; + + old_n_buckets = lb->lb_n_buckets; + buckets = load_balance_get_buckets(lb); + + load_balance_set_n_buckets(lb, n_buckets); + CLIB_MEMORY_BARRIER(); + + load_balance_fill_buckets(lb, nhs, + buckets, + n_buckets); + + for (ii = n_buckets; ii < old_n_buckets; ii++) + { + dpo_reset(&buckets[ii]); + } + } + } + } + + vec_foreach (nh, nhs) + { + dpo_reset(&nh->path_dpo); + } + vec_free(nhs); + vec_free(fixed_nhs); + + load_balance_map_unlock(old_lbmi); +} + +static void +load_balance_lock (dpo_id_t *dpo) +{ + load_balance_t *lb; + + lb = load_balance_get(dpo->dpoi_index); + + lb->lb_locks++; +} + +static void +load_balance_destroy (load_balance_t *lb) +{ + dpo_id_t *buckets; + int i; + + buckets = load_balance_get_buckets(lb); + + for (i = 0; i < lb->lb_n_buckets; i++) + { + dpo_reset(&buckets[i]); + } + + LB_DBG(lb, "destroy"); + if (!LB_HAS_INLINE_BUCKETS(lb)) + { + vec_free(lb->lb_buckets); + } + + fib_urpf_list_unlock(lb->lb_urpf); + load_balance_map_unlock(lb->lb_map); + + pool_put(load_balance_pool, lb); +} + +static void +load_balance_unlock (dpo_id_t *dpo) +{ + load_balance_t *lb; + + lb = load_balance_get(dpo->dpoi_index); + + lb->lb_locks--; + + if (0 == lb->lb_locks) + { + load_balance_destroy(lb); + } +} + +static void +load_balance_mem_show (void) +{ + fib_show_memory_usage("load-balance", + pool_elts(load_balance_pool), + pool_len(load_balance_pool), + sizeof(load_balance_t)); + load_balance_map_show_mem(); +} + +const static dpo_vft_t lb_vft = { + .dv_lock = load_balance_lock, + .dv_unlock = load_balance_unlock, + .dv_format = format_load_balance_dpo, + .dv_mem_show = load_balance_mem_show, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a load-balance + * object. + * + * this means that these graph nodes are ones from which a load-balance is the + * parent object in the DPO-graph. + * + * We do not list all the load-balance nodes, such as the *-lookup. instead + * we are relying on the correct use of the .sibling_of field when setting + * up these sibling nodes. 
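* (For example, ip4-load-balance is registered with .sibling_of =
+ * "ip4-lookup", so the two nodes share the same set of next-node
+ * arcs - editor's note.)
+ 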
+ */ +const static char* const load_balance_ip4_nodes[] = +{ + "ip4-load-balance", + NULL, +}; +const static char* const load_balance_ip6_nodes[] = +{ + "ip6-load-balance", + NULL, +}; +const static char* const load_balance_mpls_nodes[] = +{ + "mpls-load-balance", + NULL, +}; +const static char* const load_balance_l2_nodes[] = +{ + "l2-load-balance", + NULL, +}; +const static char* const load_balance_nsh_nodes[] = +{ + "nsh-load-balance", + NULL, +}; +const static char* const * const load_balance_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = load_balance_ip4_nodes, + [DPO_PROTO_IP6] = load_balance_ip6_nodes, + [DPO_PROTO_MPLS] = load_balance_mpls_nodes, + [DPO_PROTO_ETHERNET] = load_balance_l2_nodes, + [DPO_PROTO_NSH] = load_balance_nsh_nodes, +}; + +void +load_balance_module_init (void) +{ + index_t lbi; + + dpo_register(DPO_LOAD_BALANCE, &lb_vft, load_balance_nodes); + + /* + * Special LB with index zero. we need to define this since the v4 mtrie + * assumes an index of 0 implies the ply is empty. therefore all 'real' + * adjs need a non-zero index. + * This should never be used, but just in case, stack it on a drop. + */ + lbi = load_balance_create(1, DPO_PROTO_IP4, 0); + load_balance_set_bucket(lbi, 0, drop_dpo_get(DPO_PROTO_IP4)); + + load_balance_map_module_init(); +} + +static clib_error_t * +load_balance_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + index_t lbi = INDEX_INVALID; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &lbi)) + ; + else + break; + } + + if (INDEX_INVALID != lbi) + { + vlib_cli_output (vm, "%U", format_load_balance, lbi, + LOAD_BALANCE_FORMAT_DETAIL); + } + else + { + load_balance_t *lb; + + pool_foreach(lb, load_balance_pool, + ({ + vlib_cli_output (vm, "%U", format_load_balance, + load_balance_get_index(lb), + LOAD_BALANCE_FORMAT_NONE); + })); + } + + return 0; +} + +VLIB_CLI_COMMAND (load_balance_show_command, static) = { + .path = "show load-balance", + .short_help = "show load-balance [<index>]", + .function = load_balance_show, +}; + + +always_inline u32 +ip_flow_hash (void *data) +{ + ip4_header_t *iph = (ip4_header_t *) data; + + if ((iph->ip_version_and_header_length & 0xF0) == 0x40) + return ip4_compute_flow_hash (iph, IP_FLOW_HASH_DEFAULT); + else + return ip6_compute_flow_hash ((ip6_header_t *) iph, IP_FLOW_HASH_DEFAULT); +} + +always_inline u64 +mac_to_u64 (u8 * m) +{ + return (*((u64 *) m) & 0xffffffffffff); +} + +always_inline u32 +l2_flow_hash (vlib_buffer_t * b0) +{ + ethernet_header_t *eh; + u64 a, b, c; + uword is_ip, eh_size; + u16 eh_type; + + eh = vlib_buffer_get_current (b0); + eh_type = clib_net_to_host_u16 (eh->type); + eh_size = ethernet_buffer_header_size (b0); + + is_ip = (eh_type == ETHERNET_TYPE_IP4 || eh_type == ETHERNET_TYPE_IP6); + + /* since we have 2 cache lines, use them */ + if (is_ip) + a = ip_flow_hash ((u8 *) vlib_buffer_get_current (b0) + eh_size); + else + a = eh->type; + + b = mac_to_u64 ((u8 *) eh->dst_address); + c = mac_to_u64 ((u8 *) eh->src_address); + hash_mix64 (a, b, c); + + return (u32) c; +} + +typedef struct load_balance_trace_t_ +{ + index_t lb_index; +} load_balance_trace_t; + +static uword +l2_load_balance (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, next_index, *from, *to_next; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + 
vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 bi0, lbi0, next0; + const dpo_id_t *dpo0; + const load_balance_t *lb0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* lookup dst + src mac */ + lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + lb0 = load_balance_get(lbi0); + + vnet_buffer(b0)->ip.flow_hash = l2_flow_hash(b0); + + dpo0 = load_balance_get_bucket_i(lb0, + vnet_buffer(b0)->ip.flow_hash & + (lb0->lb_n_buckets_minus_1)); + + next0 = dpo0->dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + load_balance_trace_t *tr = vlib_add_trace (vm, node, b0, + sizeof (*tr)); + tr->lb_index = lbi0; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static u8 * +format_l2_load_balance_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *); + + s = format (s, "L2-load-balance: index %d", t->lb_index); + return s; +} + +/** + * @brief + */ +VLIB_REGISTER_NODE (l2_load_balance_node) = { + .function = l2_load_balance, + .name = "l2-load-balance", + .vector_size = sizeof (u32), + + .format_trace = format_l2_load_balance_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +static uword +nsh_load_balance (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, next_index, *from, *to_next; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 bi0, lbi0, next0, *nsh0; + const dpo_id_t *dpo0; + const load_balance_t *lb0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + lbi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + lb0 = load_balance_get(lbi0); + + /* SPI + SI are the second word of the NSH header */ + nsh0 = vlib_buffer_get_current (b0); + vnet_buffer(b0)->ip.flow_hash = nsh0[1] % lb0->lb_n_buckets; + + dpo0 = load_balance_get_bucket_i(lb0, + vnet_buffer(b0)->ip.flow_hash & + (lb0->lb_n_buckets_minus_1)); + + next0 = dpo0->dpoi_next_node; + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + load_balance_trace_t *tr = vlib_add_trace (vm, node, b0, + sizeof (*tr)); + tr->lb_index = lbi0; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static u8 * +format_nsh_load_balance_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + load_balance_trace_t *t = va_arg (*args, load_balance_trace_t *); + + s = format 
(s, "NSH-load-balance: index %d", t->lb_index); + return s; +} + +/** + * @brief + */ +VLIB_REGISTER_NODE (nsh_load_balance_node) = { + .function = nsh_load_balance, + .name = "nsh-load-balance", + .vector_size = sizeof (u32), + + .format_trace = format_nsh_load_balance_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; diff --git a/src/vnet/dpo/load_balance.h b/src/vnet/dpo/load_balance.h new file mode 100644 index 00000000..b901c5be --- /dev/null +++ b/src/vnet/dpo/load_balance.h @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * \brief + * The load-balance object represents an ECMP choice. The buckets of a load + * balance object point to the sub-graph after the choice is made. + * THe load-balance object is also object type returned from a FIB table lookup. + * As such it needs to represent the case where there is only one coice. It may + * seem like overkill to use a load-balance object in this case, but the reason + * is for performance. If the load-balance object were not the result of the FIB + * lookup, then some other object would be. The case where there was ECMP + * this other object would need a load-balance as a parent and hence just add + * an unnecessary indirection. + * + * It is also the object in the DP that represents a via-fib-entry in a recursive + * route. + * + */ + +#ifndef __LOAD_BALANCE_H__ +#define __LOAD_BALANCE_H__ + +#include <vlib/vlib.h> +#include <vnet/ip/lookup.h> +#include <vnet/dpo/dpo.h> +#include <vnet/fib/fib_types.h> +#include <vnet/fib/fib_entry.h> + +/** + * Load-balance main + */ +typedef struct load_balance_main_t_ +{ + vlib_combined_counter_main_t lbm_to_counters; + vlib_combined_counter_main_t lbm_via_counters; +} load_balance_main_t; + +extern load_balance_main_t load_balance_main; + +/** + * The number of buckets that a load-balance object can have and still + * fit in one cache-line + */ +#define LB_NUM_INLINE_BUCKETS 4 + +/** + * @brief One path from an [EU]CMP set that the client wants to add to a + * load-balance object + */ +typedef struct load_balance_path_t_ { + /** + * ID of the Data-path object. + */ + dpo_id_t path_dpo; + + /** + * The index of the FIB path + */ + fib_node_index_t path_index; + + /** + * weight for the path. + */ + u32 path_weight; +} load_balance_path_t; + +/** + * The FIB DPO provieds; + * - load-balancing over the next DPOs in the chain/graph + * - per-route counters + */ +typedef struct load_balance_t_ { + /** + * number of buckets in the load-balance. always a power of 2. + */ + u16 lb_n_buckets; + /** + * number of buckets in the load-balance - 1. used in the switch path + * as part of the hash calculation. + */ + u16 lb_n_buckets_minus_1; + + /** + * The protocol of packets that traverse this LB. + * need in combination with the flow hash config to determine how to hash. + * u8. 
+ */ + dpo_proto_t lb_proto; + + /** + * Flags from the load-balance's associated fib_entry_t + */ + fib_entry_flag_t lb_fib_entry_flags; + + /** + * The number of locks, which is approximately the number of users, + * of this load-balance. + * Load-balance objects of via-entries are heavily shared by recursives, + * so the lock count is a u32. + */ + u32 lb_locks; + + /** + * index of the load-balance map, INVALID if this LB does not use one + */ + index_t lb_map; + + /** + * This is the index of the uRPF list for this LB + */ + index_t lb_urpf; + + /** + * the hash config to use when selecting a bucket. this is a u16 + */ + flow_hash_config_t lb_hash_config; + + /** + * Vector of buckets containing the next DPOs, sized as lbo_num + */ + dpo_id_t *lb_buckets; + + /** + * The rest of the cache line is used for buckets. In the common case + * where there there are less than 4 buckets, then the buckets are + * on the same cachlie and we save ourselves a pointer dereferance in + * the data-path. + */ + dpo_id_t lb_buckets_inline[LB_NUM_INLINE_BUCKETS]; +} load_balance_t; + +STATIC_ASSERT(sizeof(load_balance_t) <= CLIB_CACHE_LINE_BYTES, + "A load_balance object size exceeds one cachline"); + +/** + * Flags controlling load-balance formatting/display + */ +typedef enum load_balance_format_flags_t_ { + LOAD_BALANCE_FORMAT_NONE, + LOAD_BALANCE_FORMAT_DETAIL = (1 << 0), +} load_balance_format_flags_t; + +/** + * Flags controlling load-balance creation and modification + */ +typedef enum load_balance_flags_t_ { + LOAD_BALANCE_FLAG_NONE = 0, + LOAD_BALANCE_FLAG_USES_MAP = (1 << 0), +} load_balance_flags_t; + +extern index_t load_balance_create(u32 num_buckets, + dpo_proto_t lb_proto, + flow_hash_config_t fhc); +extern void load_balance_multipath_update( + const dpo_id_t *dpo, + const load_balance_path_t * raw_next_hops, + load_balance_flags_t flags); + +extern void load_balance_set_bucket(index_t lbi, + u32 bucket, + const dpo_id_t *next); +extern void load_balance_set_urpf(index_t lbi, + index_t urpf); +extern void load_balance_set_fib_entry_flags(index_t lbi, + fib_entry_flag_t flags); +extern index_t load_balance_get_urpf(index_t lbi); + +extern u8* format_load_balance(u8 * s, va_list * args); + +extern const dpo_id_t *load_balance_get_bucket(index_t lbi, + u32 bucket); +extern int load_balance_is_drop(const dpo_id_t *dpo); + +extern f64 load_balance_get_multipath_tolerance(void); + +/** + * The encapsulation breakages are for fast DP access + */ +extern load_balance_t *load_balance_pool; +static inline load_balance_t* +load_balance_get (index_t lbi) +{ + return (pool_elt_at_index(load_balance_pool, lbi)); +} + +#define LB_HAS_INLINE_BUCKETS(_lb) \ + ((_lb)->lb_n_buckets <= LB_NUM_INLINE_BUCKETS) + +static inline const dpo_id_t * +load_balance_get_bucket_i (const load_balance_t *lb, + u32 bucket) +{ + ASSERT(bucket < lb->lb_n_buckets); + + if (PREDICT_TRUE(LB_HAS_INLINE_BUCKETS(lb))) + { + return (&lb->lb_buckets_inline[bucket]); + } + else + { + return (&lb->lb_buckets[bucket]); + } +} + +extern void load_balance_module_init(void); + +#endif diff --git a/src/vnet/dpo/load_balance_map.c b/src/vnet/dpo/load_balance_map.c new file mode 100644 index 00000000..4e27e5db --- /dev/null +++ b/src/vnet/dpo/load_balance_map.c @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + */ +#include <vnet/fib/fib_path.h> +#include <vnet/fib/fib_node_list.h> +#include <vnet/dpo/load_balance_map.h> +#include <vnet/dpo/load_balance.h> + +/** + * A hash-table of load-balance maps by path index. + * this provides the fast lookup of the LB map when a path goes down + */ +static uword *lb_maps_by_path_index; + +/** + * A hash-table of load-balance maps by set of paths. + * This provides the LB map sharing. + * LB maps do not necessarily use all the paths in the list, since + * the entry that is requesting the map, may not have an out-going + * label for each of the paths. + */ +static uword *load_balance_map_db; + +typedef enum load_balance_map_path_flags_t_ +{ + LOAD_BALANCE_MAP_PATH_UP = (1 << 0), + LOAD_BALANCE_MAP_PATH_USABLE = (1 << 1), +} __attribute__ ((packed)) load_balance_map_path_flags_t; + +typedef struct load_balance_map_path_t_ { + /** + * Index of the path + */ + fib_node_index_t lbmp_index; + + /** + * Sibling Index in the list of all maps with this path index + */ + fib_node_index_t lbmp_sibling; + + /** + * the normalised wegiht of the path + */ + u32 lbmp_weight; + + /** + * The sate of the path + */ + load_balance_map_path_flags_t lbmp_flags; +} load_balance_map_path_t; + +/** + * The global pool of LB maps + */ +load_balance_map_t *load_balance_map_pool; + +/* + * Debug macro + */ +#ifdef FIB_DEBUG +#define LOAD_BALANCE_MAP_DBG(_pl, _fmt, _args...) \ + { \ + clib_warning("lbm: FIXME" _fmt, \ + ##_args); \ + } +#else +#define LOAD_BALANCE_MAP_DBG(_pl, _fmt, _args...) 
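+/* LOAD_BALANCE_MAP_DBG expands to nothing when FIB_DEBUG is not
+ * defined (editor's note) */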
+#endif + +static index_t +load_balance_map_get_index (load_balance_map_t *lbm) +{ + return (lbm - load_balance_map_pool); +} + +u8* +format_load_balance_map (u8 *s, va_list ap) +{ + index_t lbmi = va_arg(ap, index_t); + u32 indent = va_arg(ap, u32); + load_balance_map_t *lbm; + u32 n_buckets, ii; + + lbm = load_balance_map_get(lbmi); + n_buckets = vec_len(lbm->lbm_buckets); + + s = format(s, "load-balance-map: index:%d buckets:%d", lbmi, n_buckets); + s = format(s, "\n%U index:", format_white_space, indent+2); + for (ii = 0; ii < n_buckets; ii++) + { + s = format(s, "%5d", ii); + } + s = format(s, "\n%U map:", format_white_space, indent+2); + for (ii = 0; ii < n_buckets; ii++) + { + s = format(s, "%5d", lbm->lbm_buckets[ii]); + } + + return (s); +} + + +static uword +load_balance_map_hash (load_balance_map_t *lbm) +{ + u32 old_lbm_hash, new_lbm_hash, hash; + load_balance_map_path_t *lb_path; + + new_lbm_hash = old_lbm_hash = vec_len(lbm->lbm_paths); + + vec_foreach (lb_path, lbm->lbm_paths) + { + hash = lb_path->lbmp_index; + hash_mix32(hash, old_lbm_hash, new_lbm_hash); + } + + return (new_lbm_hash); +} + +always_inline uword +load_balance_map_db_hash_key_from_index (uword index) +{ + return 1 + 2*index; +} + +always_inline uword +load_balance_map_db_hash_key_is_index (uword key) +{ + return key & 1; +} + +always_inline uword +load_balance_map_db_hash_key_2_index (uword key) +{ + ASSERT (load_balance_map_db_hash_key_is_index (key)); + return key / 2; +} + +static load_balance_map_t* +load_balance_map_db_get_from_hash_key (uword key) +{ + load_balance_map_t *lbm; + + if (load_balance_map_db_hash_key_is_index (key)) + { + index_t lbm_index; + + lbm_index = load_balance_map_db_hash_key_2_index(key); + lbm = load_balance_map_get(lbm_index); + } + else + { + lbm = uword_to_pointer (key, load_balance_map_t *); + } + + return (lbm); +} + +static uword +load_balance_map_db_hash_key_sum (hash_t * h, + uword key) +{ + load_balance_map_t *lbm; + + lbm = load_balance_map_db_get_from_hash_key(key); + + return (load_balance_map_hash(lbm)); +} + +static uword +load_balance_map_db_hash_key_equal (hash_t * h, + uword key1, + uword key2) +{ + load_balance_map_t *lbm1, *lbm2; + + lbm1 = load_balance_map_db_get_from_hash_key(key1); + lbm2 = load_balance_map_db_get_from_hash_key(key2); + + return (load_balance_map_hash(lbm1) == + load_balance_map_hash(lbm2)); +} + +static index_t +load_balance_map_db_find (load_balance_map_t *lbm) +{ + uword *p; + + p = hash_get(load_balance_map_db, lbm); + + if (NULL != p) + { + return p[0]; + } + + return (FIB_NODE_INDEX_INVALID); +} + +static void +load_balance_map_db_insert (load_balance_map_t *lbm) +{ + load_balance_map_path_t *lbmp; + fib_node_list_t list; + uword *p; + + ASSERT(FIB_NODE_INDEX_INVALID == load_balance_map_db_find(lbm)); + + /* + * insert into the DB based on the set of paths. + */ + hash_set (load_balance_map_db, + load_balance_map_db_hash_key_from_index( + load_balance_map_get_index(lbm)), + load_balance_map_get_index(lbm)); + + /* + * insert into each per-path list. 
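+     * These per-path lists are what load_balance_map_path_state_change()
+     * walks when a path changes state (editor's note).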
+ */ + vec_foreach(lbmp, lbm->lbm_paths) + { + p = hash_get(lb_maps_by_path_index, lbmp->lbmp_index); + + if (NULL == p) + { + list = fib_node_list_create(); + hash_set(lb_maps_by_path_index, lbmp->lbmp_index, list); + } + else + { + list = p[0]; + } + + lbmp->lbmp_sibling = + fib_node_list_push_front(list, + 0, FIB_NODE_TYPE_FIRST, + load_balance_map_get_index(lbm)); + } + + LOAD_BALANCE_MAP_DBG(lbm, "DB-inserted"); +} + +static void +load_balance_map_db_remove (load_balance_map_t *lbm) +{ + load_balance_map_path_t *lbmp; + uword *p; + + ASSERT(FIB_NODE_INDEX_INVALID != load_balance_map_db_find(lbm)); + + hash_unset(load_balance_map_db, + load_balance_map_db_hash_key_from_index( + load_balance_map_get_index(lbm))); + + /* + * remove from each per-path list. + */ + vec_foreach(lbmp, lbm->lbm_paths) + { + p = hash_get(lb_maps_by_path_index, lbmp->lbmp_index); + + ASSERT(NULL != p); + + fib_node_list_remove(p[0], lbmp->lbmp_sibling); + } + + LOAD_BALANCE_MAP_DBG(lbm, "DB-removed"); +} + +/** + * @brief from the paths that are usable, fill the Map. + */ +static void +load_balance_map_fill (load_balance_map_t *lbm) +{ + load_balance_map_path_t *lbmp; + u32 n_buckets, bucket, ii, jj; + u16 *tmp_buckets; + + tmp_buckets = NULL; + n_buckets = vec_len(lbm->lbm_buckets); + + /* + * run throught the set of paths once, and build a vector of the + * indices that are usable. we do this is a scratch space, since we + * need to refer to it multiple times as we build the real buckets. + */ + vec_validate(tmp_buckets, n_buckets-1); + + bucket = jj = 0; + vec_foreach (lbmp, lbm->lbm_paths) + { + if (fib_path_is_resolved(lbmp->lbmp_index)) + { + for (ii = 0; ii < lbmp->lbmp_weight; ii++) + { + tmp_buckets[jj++] = bucket++; + } + } + else + { + bucket += lbmp->lbmp_weight; + } + } + _vec_len(tmp_buckets) = jj; + + /* + * If the number of temporaries written is as many as we need, implying + * all paths were up, then we can simply copy the scratch area over the + * actual buckets' memory + */ + if (jj == n_buckets) + { + memcpy(lbm->lbm_buckets, + tmp_buckets, + sizeof(lbm->lbm_buckets[0]) * n_buckets); + } + else + { + /* + * one or more paths are down. + */ + if (0 == vec_len(tmp_buckets)) + { + /* + * if the scratch area is empty, then no paths are usable. + * they will all drop. so use them all, lest we account drops + * against only one. + */ + for (bucket = 0; bucket < n_buckets; bucket++) + { + lbm->lbm_buckets[bucket] = bucket; + } + } + else + { + bucket = jj = 0; + vec_foreach (lbmp, lbm->lbm_paths) + { + if (fib_path_is_resolved(lbmp->lbmp_index)) + { + for (ii = 0; ii < lbmp->lbmp_weight; ii++) + { + lbm->lbm_buckets[bucket] = bucket; + bucket++; + } + } + else + { + /* + * path is unusable + * cycle through the scratch space selecting a index. + * this means we load balance, in the intended ratio, + * over the paths that are still usable. 
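For example (editor's
+                         * note): three equal-weight paths with path 1 down
+                         * re-stripe as map = [0, 0, 2], so bucket 1's share
+                         * of traffic moves to path 0.
+                        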
+ */ + for (ii = 0; ii < lbmp->lbmp_weight; ii++) + { + lbm->lbm_buckets[bucket] = tmp_buckets[jj]; + jj = (jj + 1) % vec_len(tmp_buckets); + bucket++; + } + } + } + } + } + + vec_free(tmp_buckets); +} + +static load_balance_map_t* +load_balance_map_alloc (const load_balance_path_t *paths) +{ + load_balance_map_t *lbm; + u32 ii; + + pool_get_aligned(load_balance_map_pool, lbm, CLIB_CACHE_LINE_BYTES); + memset(lbm, 0, sizeof(*lbm)); + + vec_validate(lbm->lbm_paths, vec_len(paths)-1); + + vec_foreach_index(ii, paths) + { + lbm->lbm_paths[ii].lbmp_index = paths[ii].path_index; + lbm->lbm_paths[ii].lbmp_weight = paths[ii].path_weight; + } + + return (lbm); +} + +static load_balance_map_t * +load_balance_map_init (load_balance_map_t *lbm, + u32 n_buckets, + u32 sum_of_weights) +{ + lbm->lbm_sum_of_norm_weights = sum_of_weights; + vec_validate(lbm->lbm_buckets, n_buckets-1); + + load_balance_map_db_insert(lbm); + + load_balance_map_fill(lbm); + + return (lbm); +} + +static void +load_balance_map_destroy (load_balance_map_t *lbm) +{ + vec_free(lbm->lbm_paths); + vec_free(lbm->lbm_buckets); + pool_put(load_balance_map_pool, lbm); +} + +index_t +load_balance_map_add_or_lock (u32 n_buckets, + u32 sum_of_weights, + const load_balance_path_t *paths) +{ + load_balance_map_t *tmp, *lbm; + index_t lbmi; + + tmp = load_balance_map_alloc(paths); + + lbmi = load_balance_map_db_find(tmp); + + if (INDEX_INVALID == lbmi) + { + lbm = load_balance_map_init(tmp, n_buckets, sum_of_weights); + } + else + { + lbm = load_balance_map_get(lbmi); + load_balance_map_destroy(tmp); + } + + lbm->lbm_locks++; + + return (load_balance_map_get_index(lbm)); +} + +void +load_balance_map_lock (index_t lbmi) +{ + load_balance_map_t *lbm; + + lbm = load_balance_map_get(lbmi); + + lbm->lbm_locks++; +} + +void +load_balance_map_unlock (index_t lbmi) +{ + load_balance_map_t *lbm; + + if (INDEX_INVALID == lbmi) + { + return; + } + + lbm = load_balance_map_get(lbmi); + + lbm->lbm_locks--; + + if (0 == lbm->lbm_locks) + { + load_balance_map_db_remove(lbm); + load_balance_map_destroy(lbm); + } +} + +static int +load_balance_map_path_state_change_walk (fib_node_ptr_t *fptr, + void *ctx) +{ + load_balance_map_t *lbm; + + lbm = load_balance_map_get(fptr->fnp_index); + + load_balance_map_fill(lbm); + + return (!0); +} + +/** + * @brief the state of a path has changed (it has no doubt gone down). + * This is the trigger to perform a PIC edge cutover and update the maps + * to exclude this path. 
+ */ +void +load_balance_map_path_state_change (fib_node_index_t path_index) +{ + uword *p; + + /* + * re-stripe the buckets for each affect MAP + */ + p = hash_get(lb_maps_by_path_index, path_index); + + if (NULL == p) + return; + + fib_node_list_walk(p[0], load_balance_map_path_state_change_walk, NULL); +} + +/** + * @brief Make/add a new or lock an existing Load-balance map + */ +void +load_balance_map_module_init (void) +{ + load_balance_map_db = + hash_create2 (/* elts */ 0, + /* user */ 0, + /* value_bytes */ sizeof (index_t), + load_balance_map_db_hash_key_sum, + load_balance_map_db_hash_key_equal, + /* format pair/arg */ + 0, 0); + + lb_maps_by_path_index = hash_create(0, sizeof(fib_node_list_t)); +} + +void +load_balance_map_show_mem (void) +{ + fib_show_memory_usage("Load-Balance Map", + pool_elts(load_balance_map_pool), + pool_len(load_balance_map_pool), + sizeof(load_balance_map_t)); +} + +static clib_error_t * +load_balance_map_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + index_t lbmi = INDEX_INVALID; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &lbmi)) + ; + else + break; + } + + if (INDEX_INVALID != lbmi) + { + vlib_cli_output (vm, "%U", format_load_balance_map, lbmi, 0); + } + else + { + load_balance_map_t *lbm; + + pool_foreach(lbm, load_balance_map_pool, + ({ + vlib_cli_output (vm, "%U", format_load_balance_map, + load_balance_map_get_index(lbm), 0); + })); + } + + return 0; +} + +VLIB_CLI_COMMAND (load_balance_map_show_command, static) = { + .path = "show load-balance-map", + .short_help = "show load-balance-map [<index>]", + .function = load_balance_map_show, +}; diff --git a/src/vnet/dpo/load_balance_map.h b/src/vnet/dpo/load_balance_map.h new file mode 100644 index 00000000..237f24b0 --- /dev/null +++ b/src/vnet/dpo/load_balance_map.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @brief + */ + +#ifndef __LOAD_BALANCE_MAP_H__ +#define __LOAD_BALANCE_MAP_H__ + +#include <vlib/vlib.h> +#include <vnet/fib/fib_types.h> +#include <vnet/dpo/load_balance.h> + +struct load_balance_map_path_t_; + +/** + */ +typedef struct load_balance_map_t_ { + /** + * The buckets of the map that provide the index to index translation. + * In the first cacheline. + */ + u16 *lbm_buckets; + + /** + * the vector of paths this MAP represents + */ + struct load_balance_map_path_t_ *lbm_paths; + + /** + * the sum of the normalised weights. cache for convenience + */ + u32 lbm_sum_of_norm_weights; + + /** + * Number of locks. 
Maps are shared by a large number of recrusvie fib_entry_ts + */ + u32 lbm_locks; +} load_balance_map_t; + +extern index_t load_balance_map_add_or_lock(u32 n_buckets, + u32 sum_of_weights, + const load_balance_path_t *norm_paths); + +extern void load_balance_map_lock(index_t lmbi); +extern void load_balance_map_unlock(index_t lbmi); + +extern void load_balance_map_path_state_change(fib_node_index_t path_index); + +extern u8* format_load_balance_map(u8 *s, va_list ap); +extern void load_balance_map_show_mem(void); + +/** + * The encapsulation breakages are for fast DP access + */ +extern load_balance_map_t *load_balance_map_pool; + +static inline load_balance_map_t* +load_balance_map_get (index_t lbmi) +{ + return (pool_elt_at_index(load_balance_map_pool, lbmi)); +} + +static inline u16 +load_balance_map_translate (index_t lbmi, + u16 bucket) +{ + load_balance_map_t*lbm; + + lbm = load_balance_map_get(lbmi); + + return (lbm->lbm_buckets[bucket]); +} + +static inline const dpo_id_t * +load_balance_get_fwd_bucket (const load_balance_t *lb, + u16 bucket) +{ + ASSERT(bucket < lb->lb_n_buckets); + + if (INDEX_INVALID != lb->lb_map) + { + bucket = load_balance_map_translate(lb->lb_map, bucket); + } + + if (PREDICT_TRUE(LB_HAS_INLINE_BUCKETS(lb))) + { + return (&lb->lb_buckets_inline[bucket]); + } + else + { + return (&lb->lb_buckets[bucket]); + } +} + +extern void load_balance_map_module_init(void); + +#endif diff --git a/src/vnet/dpo/lookup_dpo.c b/src/vnet/dpo/lookup_dpo.c new file mode 100644 index 00000000..af189eda --- /dev/null +++ b/src/vnet/dpo/lookup_dpo.c @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/ip/ip.h> +#include <vnet/dpo/lookup_dpo.h> +#include <vnet/dpo/load_balance_map.h> +#include <vnet/mpls/mpls_lookup.h> +#include <vnet/fib/fib_table.h> +#include <vnet/fib/ip4_fib.h> +#include <vnet/fib/ip6_fib.h> +#include <vnet/fib/mpls_fib.h> +#include <vnet/mfib/mfib_table.h> +#include <vnet/mfib/ip4_mfib.h> +#include <vnet/mfib/ip6_mfib.h> + +static const char *const lookup_input_names[] = LOOKUP_INPUTS; +static const char *const lookup_cast_names[] = LOOKUP_CASTS; + +/** + * @brief Enumeration of the lookup subtypes + */ +typedef enum lookup_sub_type_t_ +{ + LOOKUP_SUB_TYPE_SRC, + LOOKUP_SUB_TYPE_DST, + LOOKUP_SUB_TYPE_DST_MCAST, + LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE, +} lookup_sub_type_t; +#define LOOKUP_SUB_TYPE_NUM (LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE+1) + +#define FOR_EACH_LOOKUP_SUB_TYPE(_st) \ + for (_st = LOOKUP_SUB_TYPE_IP4_SRC; _st < LOOKUP_SUB_TYPE_NUM; _st++) + +/** + * @brief pool of all MPLS Label DPOs + */ +lookup_dpo_t *lookup_dpo_pool; + +/** + * @brief An array of registered DPO type values for the sub-types + */ +static dpo_type_t lookup_dpo_sub_types[LOOKUP_SUB_TYPE_NUM]; + +static lookup_dpo_t * +lookup_dpo_alloc (void) +{ + lookup_dpo_t *lkd; + + pool_get_aligned(lookup_dpo_pool, lkd, CLIB_CACHE_LINE_BYTES); + + return (lkd); +} + +static index_t +lookup_dpo_get_index (lookup_dpo_t *lkd) +{ + return (lkd - lookup_dpo_pool); +} + +static void +lookup_dpo_add_or_lock_i (fib_node_index_t fib_index, + dpo_proto_t proto, + lookup_cast_t cast, + lookup_input_t input, + lookup_table_t table_config, + dpo_id_t *dpo) +{ + lookup_dpo_t *lkd; + dpo_type_t type; + + lkd = lookup_dpo_alloc(); + lkd->lkd_fib_index = fib_index; + lkd->lkd_proto = proto; + lkd->lkd_input = input; + lkd->lkd_table = table_config; + lkd->lkd_cast = cast; + + /* + * use the input type to select the lookup sub-type + */ + type = 0; + + switch (input) + { + case LOOKUP_INPUT_SRC_ADDR: + type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_SRC]; + break; + case LOOKUP_INPUT_DST_ADDR: + switch (table_config) + { + case LOOKUP_TABLE_FROM_INPUT_INTERFACE: + type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE]; + break; + case LOOKUP_TABLE_FROM_CONFIG: + type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST]; + break; + } + if (LOOKUP_MULTICAST == cast) + { + type = lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_MCAST]; + } + } + + if (0 == type) + { + dpo_reset(dpo); + } + else + { + dpo_set(dpo, type, proto, lookup_dpo_get_index(lkd)); + } +} + +void +lookup_dpo_add_or_lock_w_fib_index (fib_node_index_t fib_index, + dpo_proto_t proto, + lookup_cast_t cast, + lookup_input_t input, + lookup_table_t table_config, + dpo_id_t *dpo) +{ + if (LOOKUP_TABLE_FROM_CONFIG == table_config) + { + if (LOOKUP_UNICAST == cast) + { + fib_table_lock(fib_index, + dpo_proto_to_fib(proto), + FIB_SOURCE_RR); + } + else + { + mfib_table_lock(fib_index, + dpo_proto_to_fib(proto), + MFIB_SOURCE_RR); + } + } + lookup_dpo_add_or_lock_i(fib_index, proto, cast, input, table_config, dpo); +} + +void +lookup_dpo_add_or_lock_w_table_id (u32 table_id, + dpo_proto_t proto, + lookup_cast_t cast, + lookup_input_t input, + lookup_table_t table_config, + dpo_id_t *dpo) +{ + fib_node_index_t fib_index = FIB_NODE_INDEX_INVALID; + + if (LOOKUP_TABLE_FROM_CONFIG == table_config) + { + if (LOOKUP_UNICAST == cast) + { + fib_index = + fib_table_find_or_create_and_lock(dpo_proto_to_fib(proto), + table_id, + FIB_SOURCE_RR); + } + else + { + fib_index = + mfib_table_find_or_create_and_lock(dpo_proto_to_fib(proto), + 
table_id, + MFIB_SOURCE_RR); + } + } + + ASSERT(FIB_NODE_INDEX_INVALID != fib_index); + lookup_dpo_add_or_lock_i(fib_index, proto, cast, input, table_config, dpo); +} + +u8* +format_lookup_dpo (u8 *s, va_list *args) +{ + index_t index = va_arg (*args, index_t); + lookup_dpo_t *lkd; + + lkd = lookup_dpo_get(index); + + if (LOOKUP_TABLE_FROM_INPUT_INTERFACE == lkd->lkd_table) + { + s = format(s, "%s,%s lookup in interface's %U table", + lookup_input_names[lkd->lkd_input], + lookup_cast_names[lkd->lkd_cast], + format_dpo_proto, lkd->lkd_proto); + } + else + { + if (LOOKUP_UNICAST == lkd->lkd_cast) + { + s = format(s, "%s,%s lookup in %U", + lookup_input_names[lkd->lkd_input], + lookup_cast_names[lkd->lkd_cast], + format_fib_table_name, lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } + else + { + s = format(s, "%s,%s lookup in %U", + lookup_input_names[lkd->lkd_input], + lookup_cast_names[lkd->lkd_cast], + format_mfib_table_name, lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto)); + } + } + return (s); +} + +static void +lookup_dpo_lock (dpo_id_t *dpo) +{ + lookup_dpo_t *lkd; + + lkd = lookup_dpo_get(dpo->dpoi_index); + + lkd->lkd_locks++; +} + +static void +lookup_dpo_unlock (dpo_id_t *dpo) +{ + lookup_dpo_t *lkd; + + lkd = lookup_dpo_get(dpo->dpoi_index); + + lkd->lkd_locks--; + + if (0 == lkd->lkd_locks) + { + if (LOOKUP_TABLE_FROM_CONFIG == lkd->lkd_table) + { + if (LOOKUP_UNICAST == lkd->lkd_cast) + { + fib_table_unlock(lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto), + FIB_SOURCE_RR); + } + else + { + mfib_table_unlock(lkd->lkd_fib_index, + dpo_proto_to_fib(lkd->lkd_proto), + MFIB_SOURCE_RR); + } + } + pool_put(lookup_dpo_pool, lkd); + } +} + +always_inline void +ip4_src_fib_lookup_one (u32 src_fib_index0, + const ip4_address_t * addr0, + u32 * src_adj_index0) +{ + ip4_fib_mtrie_leaf_t leaf0; + ip4_fib_mtrie_t * mtrie0; + + mtrie0 = &ip4_fib_get (src_fib_index0)->mtrie; + + leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, addr0); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3); + + src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0); +} + +always_inline void +ip4_src_fib_lookup_two (u32 src_fib_index0, + u32 src_fib_index1, + const ip4_address_t * addr0, + const ip4_address_t * addr1, + u32 * src_adj_index0, + u32 * src_adj_index1) +{ + ip4_fib_mtrie_leaf_t leaf0, leaf1; + ip4_fib_mtrie_t * mtrie0, * mtrie1; + + mtrie0 = &ip4_fib_get (src_fib_index0)->mtrie; + mtrie1 = &ip4_fib_get (src_fib_index1)->mtrie; + + leaf0 = ip4_fib_mtrie_lookup_step_one (mtrie0, addr0); + leaf1 = ip4_fib_mtrie_lookup_step_one (mtrie1, addr1); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 2); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 2); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, addr0, 3); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, addr1, 3); + + src_adj_index0[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + src_adj_index1[0] = ip4_fib_mtrie_leaf_get_adj_index (leaf1); +} + +/** + * @brief Lookup trace data + */ +typedef struct lookup_trace_t_ +{ + union { + ip46_address_t addr; + mpls_unicast_header_t hdr; + }; + fib_node_index_t fib_index; + index_t lbi; +} lookup_trace_t; + + +always_inline uword +lookup_dpo_ip4_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + int input_src_addr, + int table_from_interface) +{ + u32 n_left_from, next_index, * from, * to_next; + u32 thread_index = vlib_get_thread_index(); + 
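/* the 'to' counters are combined packet/byte counters kept per-thread;
+     * they are bumped once per packet further down (editor's note) */
+    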
vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next > 2) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + flow_hash_config_t flow_hash_config0; + const ip4_address_t *input_addr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip4_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + u32 bi1, lkdi1, lbi1, fib_index1, next1, hash_c1; + flow_hash_config_t flow_hash_config1; + const ip4_address_t *input_addr1; + const load_balance_t *lb1; + const lookup_dpo_t * lkd1; + const ip4_header_t * ip1; + const dpo_id_t *dpo1; + vlib_buffer_t * b1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + bi0 = from[0]; + to_next[0] = bi0; + bi1 = from[1]; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + b1 = vlib_get_buffer (vm, bi1); + ip1 = vlib_buffer_get_current (b1); + + /* dst lookup was done by ip4 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkdi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + lkd1 = lookup_dpo_get(lkdi1); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip4_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + fib_index1 = + ip4_fib_table_get_index_for_sw_if_index( + vnet_buffer(b1)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + fib_index1 = lkd1->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr0 = &ip0->src_address; + input_addr1 = &ip1->src_address; + } + else + { + input_addr0 = &ip0->dst_address; + input_addr1 = &ip1->dst_address; + } + + /* do lookup */ + ip4_src_fib_lookup_two (fib_index0, fib_index1, + input_addr0, input_addr1, + &lbi0, &lbi1); + lb0 = load_balance_get(lbi0); + lb1 = load_balance_get(lbi1); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + vnet_buffer(b1)->sw_if_index[VLIB_TX] = fib_index1; + + /* Use flow hash to compute multipath adjacency. 
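+             * The hash is computed only when the load-balance has more
+             * than one bucket (the PREDICT_FALSE case below) and is
+             * stashed in the buffer meta-data so that downstream nodes
+             * make the same bucket selection.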
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip4_compute_flow_hash (ip0, flow_hash_config0); + } + + if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) + { + flow_hash_config1 = lb1->lb_hash_config; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = + ip4_compute_flow_hash (ip1, flow_hash_config1); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + dpo1 = load_balance_get_bucket_i(lb1, + (hash_c1 & + (lb1->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + next1 = dpo1->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + vlib_increment_combined_counter + (cm, thread_index, lbi1, 1, + vlib_buffer_length_in_chain (vm, b1)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip4 = *input_addr0; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b1, sizeof (*tr)); + tr->fib_index = fib_index1; + tr->lbi = lbi1; + tr->addr.ip4 = *input_addr1; + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + flow_hash_config_t flow_hash_config0; + const ip4_address_t *input_addr; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip4_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + + /* dst lookup was done by ip4 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip4_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr = &ip0->src_address; + } + else + { + input_addr = &ip0->dst_address; + } + + /* do lookup */ + ip4_src_fib_lookup_one (fib_index0, input_addr, &lbi0); + lb0 = load_balance_get(lbi0); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip4_compute_flow_hash (ip0, flow_hash_config0); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip4 = *input_addr; + } + + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +static u8 * +format_lookup_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + lookup_trace_t * t = va_arg (*args, lookup_trace_t *); + uword indent = format_get_indent (s); + s = format (s, "%U fib-index:%d addr:%U load-balance:%d", + format_white_space, indent, + t->fib_index, + format_ip46_address, &t->addr, IP46_TYPE_ANY, + t->lbi); + return s; +} + +always_inline uword +lookup_ip4_dst (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip4_inline(vm, node, from_frame, 0, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip4_dst_node) = { + .function = lookup_ip4_dst, + .name = "lookup-ip4-dst", + .vector_size = sizeof (u32), + .sibling_of = "ip4-lookup", + .format_trace = format_lookup_trace, +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_node, lookup_ip4_dst) + +always_inline uword +lookup_ip4_dst_itf (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip4_inline(vm, node, from_frame, 0, 1)); +} + +VLIB_REGISTER_NODE (lookup_ip4_dst_itf_node) = { + .function = lookup_ip4_dst_itf, + .name = "lookup-ip4-dst-itf", + .vector_size = sizeof (u32), + .sibling_of = "ip4-lookup", + .format_trace = format_lookup_trace, +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_itf_node, lookup_ip4_dst_itf) + +always_inline uword +lookup_ip4_src (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip4_inline(vm, node, from_frame, 1, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip4_src_node) = { + .function = lookup_ip4_src, + .name = "lookup-ip4-src", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip4-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_src_node, lookup_ip4_src) + +always_inline uword +lookup_dpo_ip6_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + int input_src_addr, + int table_from_interface) +{ + vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; + u32 n_left_from, next_index, * from, * to_next; + u32 thread_index = vlib_get_thread_index(); + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next > 2) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + 
flow_hash_config_t flow_hash_config0; + const ip6_address_t *input_addr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip6_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + u32 bi1, lkdi1, lbi1, fib_index1, next1, hash_c1; + flow_hash_config_t flow_hash_config1; + const ip6_address_t *input_addr1; + const load_balance_t *lb1; + const lookup_dpo_t * lkd1; + const ip6_header_t * ip1; + const dpo_id_t *dpo1; + vlib_buffer_t * b1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + bi0 = from[0]; + to_next[0] = bi0; + bi1 = from[1]; + to_next[1] = bi1; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + b1 = vlib_get_buffer (vm, bi1); + ip1 = vlib_buffer_get_current (b1); + + /* dst lookup was done by ip6 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkdi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + lkd1 = lookup_dpo_get(lkdi1); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip6_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + fib_index1 = + ip6_fib_table_get_index_for_sw_if_index( + vnet_buffer(b1)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + fib_index1 = lkd1->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr0 = &ip0->src_address; + input_addr1 = &ip1->src_address; + } + else + { + input_addr0 = &ip0->dst_address; + input_addr1 = &ip1->dst_address; + } + + /* do src lookup */ + lbi0 = ip6_fib_table_fwding_lookup(&ip6_main, + fib_index0, + input_addr0); + lbi1 = ip6_fib_table_fwding_lookup(&ip6_main, + fib_index1, + input_addr1); + lb0 = load_balance_get(lbi0); + lb1 = load_balance_get(lbi1); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + vnet_buffer(b1)->sw_if_index[VLIB_TX] = fib_index1; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip6_compute_flow_hash (ip0, flow_hash_config0); + } + + if (PREDICT_FALSE (lb1->lb_n_buckets > 1)) + { + flow_hash_config1 = lb1->lb_hash_config; + hash_c1 = vnet_buffer (b1)->ip.flow_hash = + ip6_compute_flow_hash (ip1, flow_hash_config1); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + dpo1 = load_balance_get_bucket_i(lb1, + (hash_c1 & + (lb1->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + next1 = dpo1->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + vnet_buffer(b1)->ip.adj_index[VLIB_TX] = dpo1->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + vlib_increment_combined_counter + (cm, thread_index, lbi1, 1, + vlib_buffer_length_in_chain (vm, b1)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip6 = *input_addr0; + } + if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b1, sizeof (*tr)); + tr->fib_index = fib_index1; + tr->lbi = lbi1; + tr->addr.ip6 = *input_addr1; + } + vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next, + n_left_to_next, bi0, bi1, + next0, next1); + } + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash_c0; + flow_hash_config_t flow_hash_config0; + const ip6_address_t *input_addr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const ip6_header_t * ip0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + + /* dst lookup was done by ip6 lookup */ + lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + lkd0 = lookup_dpo_get(lkdi0); + + /* + * choose between a lookup using the fib index in the DPO + * or getting the FIB index from the interface. + */ + if (table_from_interface) + { + fib_index0 = + ip6_fib_table_get_index_for_sw_if_index( + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + } + else + { + fib_index0 = lkd0->lkd_fib_index; + } + + /* + * choose between a source or destination address lookup in the table + */ + if (input_src_addr) + { + input_addr0 = &ip0->src_address; + } + else + { + input_addr0 = &ip0->dst_address; + } + + /* do src lookup */ + lbi0 = ip6_fib_table_fwding_lookup(&ip6_main, + fib_index0, + input_addr0); + lb0 = load_balance_get(lbi0); + + vnet_buffer(b0)->sw_if_index[VLIB_TX] = fib_index0; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (b0)->ip.flow_hash = 0; + + if (PREDICT_FALSE (lb0->lb_n_buckets > 1)) + { + flow_hash_config0 = lb0->lb_hash_config; + hash_c0 = vnet_buffer (b0)->ip.flow_hash = + ip6_compute_flow_hash (ip0, flow_hash_config0); + } + + dpo0 = load_balance_get_bucket_i(lb0, + (hash_c0 & + (lb0->lb_n_buckets_minus_1))); + + next0 = dpo0->dpoi_next_node; + vnet_buffer(b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + vlib_increment_combined_counter + (cm, thread_index, lbi0, 1, + vlib_buffer_length_in_chain (vm, b0)); + + if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED)) + { + lookup_trace_t *tr = vlib_add_trace (vm, node, + b0, sizeof (*tr)); + tr->fib_index = fib_index0; + tr->lbi = lbi0; + tr->addr.ip6 = *input_addr0; + } + vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + return from_frame->n_vectors; +} + +always_inline uword +lookup_ip6_dst (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip6_inline(vm, node, from_frame, 0 /*use src*/, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip6_dst_node) = { + .function = lookup_ip6_dst, + .name = "lookup-ip6-dst", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip6-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_dst_node, lookup_ip6_dst) + +always_inline uword +lookup_ip6_dst_itf (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip6_inline(vm, node, from_frame, 0 /*use src*/, 1)); +} + +VLIB_REGISTER_NODE (lookup_ip6_dst_itf_node) = { + .function = lookup_ip6_dst_itf, + .name = "lookup-ip6-dst-itf", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip6-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_dst_itf_node, lookup_ip6_dst_itf) + +always_inline uword +lookup_ip6_src (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return (lookup_dpo_ip6_inline(vm, node, from_frame, 1, 0)); +} + +VLIB_REGISTER_NODE (lookup_ip6_src_node) = { + .function = lookup_ip6_src, + .name = "lookup-ip6-src", + .vector_size = sizeof (u32), + .format_trace = format_lookup_trace, + .sibling_of = "ip6-lookup", +}; +VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_src_node, lookup_ip6_src) + +always_inline uword +lookup_dpo_mpls_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + int table_from_interface) +{ + u32 n_left_from, next_index, * from, * to_next; + u32 thread_index = vlib_get_thread_index(); + vlib_combined_counter_main_t * cm = &load_balance_main.lbm_to_counters; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + /* while (n_left_from >= 4 && n_left_to_next >= 2) */ + /* } */ + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0, lkdi0, lbi0, fib_index0, next0, hash0; + const mpls_unicast_header_t * hdr0; + const load_balance_t *lb0; + const lookup_dpo_t * lkd0; + const dpo_id_t *dpo0; + vlib_buffer_t * b0; + + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + hdr0 = vlib_buffer_get_current (b0); + + /* dst lookup was done by mpls lookup */ + lkdi0 = 
vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            lkd0 = lookup_dpo_get(lkdi0);
+
+            /*
+             * choose between a lookup using the fib index in the DPO
+             * or getting the FIB index from the interface.
+             */
+            if (table_from_interface)
+            {
+                fib_index0 =
+                    mpls_fib_table_get_index_for_sw_if_index(
+                        vnet_buffer(b0)->sw_if_index[VLIB_RX]);
+            }
+            else
+            {
+                fib_index0 = lkd0->lkd_fib_index;
+            }
+
+            /* do lookup */
+            lbi0 = mpls_fib_table_forwarding_lookup (fib_index0, hdr0);
+
+            if (MPLS_IS_REPLICATE & lbi0)
+            {
+                next0 = mpls_lookup_to_replicate_edge;
+                vnet_buffer (b0)->ip.adj_index[VLIB_TX] =
+                    (lbi0 & ~MPLS_IS_REPLICATE);
+            }
+            else
+            {
+                lb0 = load_balance_get(lbi0);
+                ASSERT (lb0->lb_n_buckets > 0);
+                ASSERT (is_pow2 (lb0->lb_n_buckets));
+
+                if (PREDICT_FALSE(lb0->lb_n_buckets > 1))
+                {
+                    hash0 = vnet_buffer (b0)->ip.flow_hash =
+                        mpls_compute_flow_hash(hdr0, lb0->lb_hash_config);
+                    dpo0 = load_balance_get_fwd_bucket
+                        (lb0,
+                         (hash0 & (lb0->lb_n_buckets_minus_1)));
+                }
+                else
+                {
+                    dpo0 = load_balance_get_bucket_i (lb0, 0);
+                }
+                next0 = dpo0->dpoi_next_node;
+
+                vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index;
+
+                vlib_increment_combined_counter
+                    (cm, thread_index, lbi0, 1,
+                     vlib_buffer_length_in_chain (vm, b0));
+            }
+
+            vnet_buffer (b0)->mpls.ttl = ((char*)hdr0)[3];
+            vnet_buffer (b0)->mpls.exp = (((char*)hdr0)[2] & 0xe) >> 1;
+            vnet_buffer (b0)->mpls.first = 1;
+            vlib_buffer_advance(b0, sizeof(*hdr0));
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                lookup_trace_t *tr = vlib_add_trace (vm, node,
+                                                     b0, sizeof (*tr));
+                tr->fib_index = fib_index0;
+                tr->lbi = lbi0;
+                tr->hdr = *hdr0;
+            }
+
+            vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+static u8 *
+format_lookup_mpls_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    lookup_trace_t * t = va_arg (*args, lookup_trace_t *);
+    uword indent = format_get_indent (s);
+    mpls_unicast_header_t hdr;
+
+    hdr.label_exp_s_ttl = clib_net_to_host_u32(t->hdr.label_exp_s_ttl);
+
+    s = format (s, "%U fib-index:%d hdr:%U load-balance:%d",
+                format_white_space, indent,
+                t->fib_index,
+                format_mpls_header, hdr,
+                t->lbi);
+    return s;
+}
+
+always_inline uword
+lookup_mpls_dst (vlib_main_t * vm,
+                 vlib_node_runtime_t * node,
+                 vlib_frame_t * from_frame)
+{
+    return (lookup_dpo_mpls_inline(vm, node, from_frame, 0));
+}
+
+VLIB_REGISTER_NODE (lookup_mpls_dst_node) = {
+    .function = lookup_mpls_dst,
+    .name = "lookup-mpls-dst",
+    .vector_size = sizeof (u32),
+    .sibling_of = "mpls-lookup",
+    .format_trace = format_lookup_mpls_trace,
+    .n_next_nodes = 0,
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_mpls_dst_node, lookup_mpls_dst)
+
+always_inline uword
+lookup_mpls_dst_itf (vlib_main_t * vm,
+                     vlib_node_runtime_t * node,
+                     vlib_frame_t * from_frame)
+{
+    return (lookup_dpo_mpls_inline(vm, node, from_frame, 1));
+}
+
+VLIB_REGISTER_NODE (lookup_mpls_dst_itf_node) = {
+    .function = lookup_mpls_dst_itf,
+    .name = "lookup-mpls-dst-itf",
+    .vector_size = sizeof (u32),
+    .sibling_of = "mpls-lookup",
+    .format_trace = format_lookup_mpls_trace,
+    .n_next_nodes = 0,
+};
+VLIB_NODE_FUNCTION_MULTIARCH
+    (lookup_mpls_dst_itf_node, lookup_mpls_dst_itf)
+
+typedef enum lookup_ip_dst_mcast_next_t_ {
+    LOOKUP_IP_DST_MCAST_NEXT_RPF,
+    LOOKUP_IP_DST_MCAST_N_NEXT,
+} lookup_ip_dst_mcast_next_t;
+
+always_inline uword
+lookup_dpo_ip_dst_mcast_inline (vlib_main_t * vm,
+                                vlib_node_runtime_t * node,
+                                vlib_frame_t * from_frame,
+                                int is_v4)
+{
+    u32 n_left_from, next_index, * from, * to_next;
+
+    from = vlib_frame_vector_args (from_frame);
+    n_left_from = from_frame->n_vectors;
+
+    next_index = LOOKUP_IP_DST_MCAST_NEXT_RPF;
+
+    while (n_left_from > 0)
+    {
+        u32 n_left_to_next;
+
+        vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+        /* while (n_left_from >= 4 && n_left_to_next >= 2) */
+        /* } */
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            u32 bi0, lkdi0, fib_index0, next0;
+            const lookup_dpo_t * lkd0;
+            fib_node_index_t mfei0;
+            vlib_buffer_t * b0;
+
+            bi0 = from[0];
+            to_next[0] = bi0;
+            from += 1;
+            to_next += 1;
+            n_left_from -= 1;
+            n_left_to_next -= 1;
+
+            b0 = vlib_get_buffer (vm, bi0);
+
+            /* the lookup DPO index was stashed by the ip4/ip6 lookup */
+            lkdi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            lkd0 = lookup_dpo_get(lkdi0);
+            fib_index0 = lkd0->lkd_fib_index;
+            next0 = LOOKUP_IP_DST_MCAST_NEXT_RPF;
+
+            if (is_v4)
+            {
+                ip4_header_t * ip0;
+
+                ip0 = vlib_buffer_get_current (b0);
+                mfei0 = ip4_mfib_table_lookup(ip4_mfib_get(fib_index0),
+                                              &ip0->src_address,
+                                              &ip0->dst_address,
+                                              64);
+                if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+                {
+                    lookup_trace_t *tr = vlib_add_trace (vm, node,
+                                                         b0, sizeof (*tr));
+                    tr->fib_index = fib_index0;
+                    tr->lbi = mfei0;
+                    tr->addr.ip4 = ip0->dst_address;
+                }
+            }
+            else
+            {
+                ip6_header_t * ip0;
+
+                ip0 = vlib_buffer_get_current (b0);
+                mfei0 = ip6_mfib_table_lookup2(ip6_mfib_get(fib_index0),
+                                               &ip0->src_address,
+                                               &ip0->dst_address);
+                if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+                {
+                    lookup_trace_t *tr = vlib_add_trace (vm, node,
+                                                         b0, sizeof (*tr));
+                    tr->fib_index = fib_index0;
+                    tr->lbi = mfei0;
+                    tr->addr.ip6 = ip0->dst_address;
+                }
+            }
+
+            vnet_buffer (b0)->ip.adj_index[VLIB_TX] = mfei0;
+
+            vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+always_inline uword
+lookup_ip4_dst_mcast (vlib_main_t * vm,
+                      vlib_node_runtime_t * node,
+                      vlib_frame_t * from_frame)
+{
+    return (lookup_dpo_ip_dst_mcast_inline(vm, node, from_frame, 1));
+}
+
+VLIB_REGISTER_NODE (lookup_ip4_dst_mcast_node) = {
+    .function = lookup_ip4_dst_mcast,
+    .name = "lookup-ip4-dst-mcast",
+    .vector_size = sizeof (u32),
+
+    .format_trace = format_lookup_trace,
+    .n_next_nodes = LOOKUP_IP_DST_MCAST_N_NEXT,
+    .next_nodes = {
+        [LOOKUP_IP_DST_MCAST_NEXT_RPF] = "ip4-mfib-forward-rpf",
+    },
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip4_dst_mcast_node,
+                              lookup_ip4_dst_mcast)
+
+always_inline uword
+lookup_ip6_dst_mcast (vlib_main_t * vm,
+                      vlib_node_runtime_t * node,
+                      vlib_frame_t * from_frame)
+{
+    return (lookup_dpo_ip_dst_mcast_inline(vm, node, from_frame, 0));
+}
+
+VLIB_REGISTER_NODE (lookup_ip6_dst_mcast_node) = {
+    .function = lookup_ip6_dst_mcast,
+    .name = "lookup-ip6-dst-mcast",
+    .vector_size = sizeof (u32),
+
+    .format_trace = format_lookup_trace,
+    .n_next_nodes = LOOKUP_IP_DST_MCAST_N_NEXT,
+    .next_nodes = {
+        [LOOKUP_IP_DST_MCAST_NEXT_RPF] = "ip6-mfib-forward-rpf",
+    },
+};
+VLIB_NODE_FUNCTION_MULTIARCH (lookup_ip6_dst_mcast_node,
+                              lookup_ip6_dst_mcast)
+
+static void
+lookup_dpo_mem_show (void)
+{
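+    /*
+     * Reports pool occupancy for the lookup DPO pool; invoked via the
+     * DPO memory show CLI (the registration below passes this function
+     * as dv_mem_show).
+     */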
+ fib_show_memory_usage("Lookup", + pool_elts(lookup_dpo_pool), + pool_len(lookup_dpo_pool), + sizeof(lookup_dpo_t)); +} + +const static dpo_vft_t lkd_vft = { + .dv_lock = lookup_dpo_lock, + .dv_unlock = lookup_dpo_unlock, + .dv_format = format_lookup_dpo, +}; +const static dpo_vft_t lkd_vft_w_mem_show = { + .dv_lock = lookup_dpo_lock, + .dv_unlock = lookup_dpo_unlock, + .dv_format = format_lookup_dpo, + .dv_mem_show = lookup_dpo_mem_show, +}; + +const static char* const lookup_src_ip4_nodes[] = +{ + "lookup-ip4-src", + NULL, +}; +const static char* const lookup_src_ip6_nodes[] = +{ + "lookup-ip6-src", + NULL, +}; +const static char* const * const lookup_src_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_src_ip4_nodes, + [DPO_PROTO_IP6] = lookup_src_ip6_nodes, + [DPO_PROTO_MPLS] = NULL, +}; + +const static char* const lookup_dst_ip4_nodes[] = +{ + "lookup-ip4-dst", + NULL, +}; +const static char* const lookup_dst_ip6_nodes[] = +{ + "lookup-ip6-dst", + NULL, +}; +const static char* const lookup_dst_mpls_nodes[] = +{ + "lookup-mpls-dst", + NULL, +}; +const static char* const * const lookup_dst_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_dst_ip4_nodes, + [DPO_PROTO_IP6] = lookup_dst_ip6_nodes, + [DPO_PROTO_MPLS] = lookup_dst_mpls_nodes, +}; + +const static char* const lookup_dst_mcast_ip4_nodes[] = +{ + "lookup-ip4-dst-mcast", + NULL, +}; +const static char* const lookup_dst_mcast_ip6_nodes[] = +{ + "lookup-ip6-dst-mcast", + NULL, +}; +const static char* const * const lookup_dst_mcast_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_dst_mcast_ip4_nodes, + [DPO_PROTO_IP6] = lookup_dst_mcast_ip6_nodes, +}; + +const static char* const lookup_dst_from_interface_ip4_nodes[] = +{ + "lookup-ip4-dst-itf", + NULL, +}; +const static char* const lookup_dst_from_interface_ip6_nodes[] = +{ + "lookup-ip6-dst-itf", + NULL, +}; +const static char* const lookup_dst_from_interface_mpls_nodes[] = +{ + "lookup-mpls-dst-itf", + NULL, +}; +const static char* const * const lookup_dst_from_interface_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = lookup_dst_from_interface_ip4_nodes, + [DPO_PROTO_IP6] = lookup_dst_from_interface_ip6_nodes, + [DPO_PROTO_MPLS] = lookup_dst_from_interface_mpls_nodes, +}; + + +void +lookup_dpo_module_init (void) +{ + dpo_register(DPO_LOOKUP, &lkd_vft_w_mem_show, NULL); + + /* + * There are various sorts of lookup; src or dst addr v4 /v6 etc. + * there isn't an object type for each (there is only the lookup_dpo_t), + * but, for performance reasons, there is a data plane function, and hence + * VLIB node for each. VLIB graph node construction is based on DPO types + * so we create sub-types. + */ + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_SRC] = + dpo_register_new_type(&lkd_vft, lookup_src_nodes); + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST] = + dpo_register_new_type(&lkd_vft, lookup_dst_nodes); + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_MCAST] = + dpo_register_new_type(&lkd_vft, lookup_dst_mcast_nodes); + lookup_dpo_sub_types[LOOKUP_SUB_TYPE_DST_TABLE_FROM_INTERFACE] = + dpo_register_new_type(&lkd_vft, lookup_dst_from_interface_nodes); +} diff --git a/src/vnet/dpo/lookup_dpo.h b/src/vnet/dpo/lookup_dpo.h new file mode 100644 index 00000000..7dfd0385 --- /dev/null +++ b/src/vnet/dpo/lookup_dpo.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOOKUP_DPO_H__
+#define __LOOKUP_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * Switch to use the packet's source or destination address for lookup
+ */
+typedef enum lookup_input_t_ {
+    LOOKUP_INPUT_SRC_ADDR,
+    LOOKUP_INPUT_DST_ADDR,
+} __attribute__ ((packed)) lookup_input_t;
+
+#define LOOKUP_INPUTS {                                 \
+    [LOOKUP_INPUT_SRC_ADDR] = "src-address",            \
+    [LOOKUP_INPUT_DST_ADDR] = "dst-address",            \
+}
+
+/**
+ * Switch to use the lookup table from the configuration or from the
+ * packet's input interface
+ */
+typedef enum lookup_table_t_ {
+    LOOKUP_TABLE_FROM_INPUT_INTERFACE,
+    LOOKUP_TABLE_FROM_CONFIG,
+} __attribute__ ((packed)) lookup_table_t;
+
+#define LOOKUP_TABLES {                                                 \
+    [LOOKUP_TABLE_FROM_INPUT_INTERFACE] = "table-input-interface",      \
+    [LOOKUP_TABLE_FROM_CONFIG] = "table-configured",                    \
+}
+
+/**
+ * Switch between a unicast and a multicast FIB lookup
+ */
+typedef enum lookup_cast_t_ {
+    LOOKUP_UNICAST,
+    LOOKUP_MULTICAST,
+} __attribute__ ((packed)) lookup_cast_t;
+
+#define LOOKUP_CASTS {                          \
+    [LOOKUP_UNICAST] = "unicast",               \
+    [LOOKUP_MULTICAST] = "multicast",           \
+}
+
+/**
+ * A representation of a deferred FIB lookup in the data-path
+ */
+typedef struct lookup_dpo_t
+{
+    /**
+     * The FIB, or interface from which to get a FIB, in which to perform
+     * the next lookup.
+     */
+    fib_node_index_t lkd_fib_index;
+
+    /**
+     * The protocol of the FIB for the lookup, and hence
+     * the protocol of the packet
+     */
+    dpo_proto_t lkd_proto;
+
+    /**
+     * Switch to use src or dst address
+     */
+    lookup_input_t lkd_input;
+
+    /**
+     * Switch to use the table index passed, or the table of the input interface
+     */
+    lookup_table_t lkd_table;
+
+    /**
+     * Unicast or multicast FIB lookup
+     */
+    lookup_cast_t lkd_cast;
+
+    /**
+     * Number of locks
+     */
+    u16 lkd_locks;
+} lookup_dpo_t;
+
+extern void lookup_dpo_add_or_lock_w_fib_index(fib_node_index_t fib_index,
+                                               dpo_proto_t proto,
+                                               lookup_cast_t cast,
+                                               lookup_input_t input,
+                                               lookup_table_t table,
+                                               dpo_id_t *dpo);
+extern void lookup_dpo_add_or_lock_w_table_id(u32 table_id,
+                                              dpo_proto_t proto,
+                                              lookup_cast_t cast,
+                                              lookup_input_t input,
+                                              lookup_table_t table,
+                                              dpo_id_t *dpo);
+
+extern u8* format_lookup_dpo(u8 *s, va_list *args);
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern lookup_dpo_t *lookup_dpo_pool;
+
+static inline lookup_dpo_t *
+lookup_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(lookup_dpo_pool, index));
+}
+
+extern void lookup_dpo_module_init(void);
+
+#endif
diff --git a/src/vnet/dpo/mpls_disposition.c b/src/vnet/dpo/mpls_disposition.c
new file mode 100644
index 00000000..5dc33fcf
--- /dev/null
+++ b/src/vnet/dpo/mpls_disposition.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/mpls_disposition.h>
+#include <vnet/mpls/mpls.h>
+
+/*
+ * pool of all MPLS disposition DPOs
+ */
+mpls_disp_dpo_t *mpls_disp_dpo_pool;
+
+static mpls_disp_dpo_t *
+mpls_disp_dpo_alloc (void)
+{
+    mpls_disp_dpo_t *mdd;
+
+    pool_get_aligned(mpls_disp_dpo_pool, mdd, CLIB_CACHE_LINE_BYTES);
+    memset(mdd, 0, sizeof(*mdd));
+
+    dpo_reset(&mdd->mdd_dpo);
+
+    return (mdd);
+}
+
+static index_t
+mpls_disp_dpo_get_index (mpls_disp_dpo_t *mdd)
+{
+    return (mdd - mpls_disp_dpo_pool);
+}
+
+index_t
+mpls_disp_dpo_create (dpo_proto_t payload_proto,
+                      fib_rpf_id_t rpf_id,
+                      const dpo_id_t *dpo)
+{
+    mpls_disp_dpo_t *mdd;
+
+    mdd = mpls_disp_dpo_alloc();
+
+    mdd->mdd_payload_proto = payload_proto;
+    mdd->mdd_rpf_id = rpf_id;
+
+    dpo_stack(DPO_MPLS_DISPOSITION,
+              mdd->mdd_payload_proto,
+              &mdd->mdd_dpo,
+              dpo);
+
+    return (mpls_disp_dpo_get_index(mdd));
+}
+
+u8*
+format_mpls_disp_dpo (u8 *s, va_list *args)
+{
+    index_t index = va_arg (*args, index_t);
+    u32 indent = va_arg (*args, u32);
+    mpls_disp_dpo_t *mdd;
+
+    mdd = mpls_disp_dpo_get(index);
+
+    s = format(s, "mpls-disposition:[%d]:[%U]",
+               index,
+               format_dpo_proto, mdd->mdd_payload_proto);
+
+    s = format(s, "\n%U", format_white_space, indent);
+    s = format(s, "%U", format_dpo_id, &mdd->mdd_dpo, indent+2);
+
+    return (s);
+}
+
+static void
+mpls_disp_dpo_lock (dpo_id_t *dpo)
+{
+    mpls_disp_dpo_t *mdd;
+
+    mdd = mpls_disp_dpo_get(dpo->dpoi_index);
+
+    mdd->mdd_locks++;
+}
+
+static void
+mpls_disp_dpo_unlock (dpo_id_t *dpo)
+{
+    mpls_disp_dpo_t *mdd;
+
+    mdd = mpls_disp_dpo_get(dpo->dpoi_index);
+
+    mdd->mdd_locks--;
+
+    if (0 == mdd->mdd_locks)
+    {
+        dpo_reset(&mdd->mdd_dpo);
+        pool_put(mpls_disp_dpo_pool, mdd);
+    }
+}
+
+/**
+ * @brief A struct to hold tracing information for the MPLS label disposition
+ * node.
+ */
+typedef struct mpls_label_disposition_trace_t_
+{
+    index_t mdd;
+} mpls_label_disposition_trace_t;
+
+always_inline uword
+mpls_label_disposition_inline (vlib_main_t * vm,
+                               vlib_node_runtime_t * node,
+                               vlib_frame_t * from_frame,
+                               u8 payload_is_ip4,
+                               u8 payload_is_ip6)
+{
+    u32 n_left_from, next_index, * from, * to_next;
+
+    from = vlib_frame_vector_args (from_frame);
+    n_left_from = from_frame->n_vectors;
+
+    next_index = node->cached_next_index;
+
+    while (n_left_from > 0)
+    {
+        u32 n_left_to_next;
+
+        vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next);
+
+        while (n_left_from >= 4 && n_left_to_next >= 2)
+        {
+            mpls_disp_dpo_t *mdd0, *mdd1;
+            u32 bi0, mddi0, bi1, mddi1;
+            vlib_buffer_t * b0, *b1;
+            u32 next0, next1;
+
+            bi0 = to_next[0] = from[0];
+            bi1 = to_next[1] = from[1];
+
+            /* Prefetch next iteration.
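+             * Pre-touching the headers (and the first data cache line)
+             * of the two buffers that follow the pair being processed
+             * hides memory latency in the dual loop.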
+             */
+            {
+                vlib_buffer_t * p2, * p3;
+
+                p2 = vlib_get_buffer (vm, from[2]);
+                p3 = vlib_get_buffer (vm, from[3]);
+
+                vlib_prefetch_buffer_header (p2, STORE);
+                vlib_prefetch_buffer_header (p3, STORE);
+
+                CLIB_PREFETCH (p2->data, sizeof (ip6_header_t), STORE);
+                CLIB_PREFETCH (p3->data, sizeof (ip6_header_t), STORE);
+            }
+
+            from += 2;
+            to_next += 2;
+            n_left_from -= 2;
+            n_left_to_next -= 2;
+
+            b0 = vlib_get_buffer (vm, bi0);
+            b1 = vlib_get_buffer (vm, bi1);
+
+            /* the disposition DPO index was stashed by the previous lookup */
+            mddi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            mddi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX];
+            mdd0 = mpls_disp_dpo_get(mddi0);
+            mdd1 = mpls_disp_dpo_get(mddi1);
+
+            if (payload_is_ip4)
+            {
+                /*
+                 * TTL propagation on egress from the LSP would be applied
+                 * here; no payload rewrite is currently performed.
+                 */
+            }
+            else if (payload_is_ip6)
+            {
+                /*
+                 * TTL propagation on egress from the LSP would be applied
+                 * here; no payload rewrite is currently performed.
+                 */
+            }
+
+            next0 = mdd0->mdd_dpo.dpoi_next_node;
+            next1 = mdd1->mdd_dpo.dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mdd0->mdd_dpo.dpoi_index;
+            vnet_buffer(b1)->ip.adj_index[VLIB_TX] = mdd1->mdd_dpo.dpoi_index;
+            vnet_buffer(b0)->ip.rpf_id = mdd0->mdd_rpf_id;
+            vnet_buffer(b1)->ip.rpf_id = mdd1->mdd_rpf_id;
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_disposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b0, sizeof (*tr));
+
+                tr->mdd = mddi0;
+            }
+            if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_disposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b1, sizeof (*tr));
+                tr->mdd = mddi1;
+            }
+
+            vlib_validate_buffer_enqueue_x2(vm, node, next_index, to_next,
+                                            n_left_to_next,
+                                            bi0, bi1, next0, next1);
+        }
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            mpls_disp_dpo_t *mdd0;
+            vlib_buffer_t * b0;
+            u32 bi0, mddi0;
+            u32 next0;
+
+            bi0 = from[0];
+            to_next[0] = bi0;
+            from += 1;
+            to_next += 1;
+            n_left_from -= 1;
+            n_left_to_next -= 1;
+
+            b0 = vlib_get_buffer (vm, bi0);
+
+            /* the disposition DPO index was stashed by the previous lookup */
+            mddi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            mdd0 = mpls_disp_dpo_get(mddi0);
+
+            if (payload_is_ip4)
+            {
+                /*
+                 * TTL propagation on egress from the LSP would be applied
+                 * here; no payload rewrite is currently performed.
+                 */
+            }
+            else if (payload_is_ip6)
+            {
+                /*
+                 * TTL propagation on egress from the LSP would be applied
+                 * here; no payload rewrite is currently performed.
+                 */
+            }
+
+            next0 = mdd0->mdd_dpo.dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mdd0->mdd_dpo.dpoi_index;
+            vnet_buffer(b0)->ip.rpf_id = mdd0->mdd_rpf_id;
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_disposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b0, sizeof (*tr));
+                tr->mdd = mddi0;
+            }
+
+            vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+static u8 *
+format_mpls_label_disposition_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    mpls_label_disposition_trace_t * t;
+
+    t = va_arg (*args, mpls_label_disposition_trace_t *);
+
+    s = format(s, "disp:%d", t->mdd);
+    return (s);
+}
+
+static uword
+ip4_mpls_label_disposition (vlib_main_t * vm,
+                            vlib_node_runtime_t * node,
+                            vlib_frame_t * frame)
+{
+    return (mpls_label_disposition_inline(vm, node, frame, 1, 0));
+}
+
+VLIB_REGISTER_NODE (ip4_mpls_label_disposition_node) = {
+    .function = ip4_mpls_label_disposition,
+    .name = "ip4-mpls-label-disposition",
+    .vector_size = sizeof (u32),
+
+    .format_trace = format_mpls_label_disposition_trace,
+    .n_next_nodes = 1,
+    .next_nodes = {
+        [0] = "ip4-drop",
+    }
+};
+VLIB_NODE_FUNCTION_MULTIARCH (ip4_mpls_label_disposition_node,
+                              ip4_mpls_label_disposition)
+
+static uword
+ip6_mpls_label_disposition (vlib_main_t * vm,
+                            vlib_node_runtime_t * node,
+                            vlib_frame_t * frame)
+{
+    return (mpls_label_disposition_inline(vm, node, frame, 0, 1));
+}
+
+VLIB_REGISTER_NODE (ip6_mpls_label_disposition_node) = {
+    .function = ip6_mpls_label_disposition,
+    .name = "ip6-mpls-label-disposition",
+    .vector_size = sizeof (u32),
+
+    .format_trace = format_mpls_label_disposition_trace,
+    .n_next_nodes = 1,
+    .next_nodes = {
+        [0] = "ip6-drop",
+    }
+};
+VLIB_NODE_FUNCTION_MULTIARCH (ip6_mpls_label_disposition_node,
+                              ip6_mpls_label_disposition)
+
+static void
+mpls_disp_dpo_mem_show (void)
+{
+    fib_show_memory_usage("MPLS disposition",
+                          pool_elts(mpls_disp_dpo_pool),
+                          pool_len(mpls_disp_dpo_pool),
+                          sizeof(mpls_disp_dpo_t));
+}
+
+const static dpo_vft_t mdd_vft = {
+    .dv_lock = mpls_disp_dpo_lock,
+    .dv_unlock = mpls_disp_dpo_unlock,
+    .dv_format = format_mpls_disp_dpo,
+    .dv_mem_show = mpls_disp_dpo_mem_show,
+};
+
+const static char* const mpls_label_disp_ip4_nodes[] =
+{
+    "ip4-mpls-label-disposition",
+    NULL,
+};
+const static char* const mpls_label_disp_ip6_nodes[] =
+{
+    "ip6-mpls-label-disposition",
+    NULL,
+};
+const static char* const * const mpls_label_disp_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4]  = mpls_label_disp_ip4_nodes,
+    [DPO_PROTO_IP6]  = mpls_label_disp_ip6_nodes,
+};
+
+
+void
+mpls_disp_dpo_module_init (void)
+{
+    dpo_register(DPO_MPLS_DISPOSITION, &mdd_vft, mpls_label_disp_nodes);
+}
diff --git a/src/vnet/dpo/mpls_disposition.h b/src/vnet/dpo/mpls_disposition.h
new file mode 100644
index 00000000..9c015083
--- /dev/null
+++ b/src/vnet/dpo/mpls_disposition.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MPLS_DISP_DPO_H__
+#define __MPLS_DISP_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/mfib/mfib_types.h>
+
+/**
+ * A representation of an MPLS label disposition action in the data-path
+ */
+typedef struct mpls_disp_dpo_t
+{
+    /**
+     * Next DPO in the graph
+     */
+    dpo_id_t mdd_dpo;
+
+    /**
+     * The protocol of the payload/packets once the label is disposed of
+     */
+    dpo_proto_t mdd_payload_proto;
+
+    /**
+     * RPF-ID (if this is an mcast disposition)
+     */
+    fib_rpf_id_t mdd_rpf_id;
+
+    /**
+     * Number of locks/users of the object
+     */
+    u16 mdd_locks;
+} mpls_disp_dpo_t;
+
+/**
+ * @brief Assert that the MPLS disposition object is less than a cache line
+ * in size. Should it get any bigger, then its layout will need to be
+ * reconsidered.
+ */
+STATIC_ASSERT((sizeof(mpls_disp_dpo_t) <= CLIB_CACHE_LINE_BYTES),
+              "MPLS Disposition DPO is larger than one cache line.");
+
+/**
+ * @brief Create an MPLS disposition object
+ *
+ * @param payload_proto The protocol of the payload packets that will
+ *                      be disposed of (i.e. the protocol after the pop).
+ * @param rpf_id The RPF-ID to apply for multicast dispositions.
+ * @param dpo The parent of the created MPLS disposition object
+ */
+extern index_t mpls_disp_dpo_create(dpo_proto_t payload_proto,
+                                    fib_rpf_id_t rpf_id,
+                                    const dpo_id_t *dpo);
+
+extern u8* format_mpls_disp_dpo(u8 *s, va_list *args);
+
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern mpls_disp_dpo_t *mpls_disp_dpo_pool;
+
+static inline mpls_disp_dpo_t *
+mpls_disp_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(mpls_disp_dpo_pool, index));
+}
+
+extern void mpls_disp_dpo_module_init(void);
+
+#endif
diff --git a/src/vnet/dpo/mpls_label_dpo.c b/src/vnet/dpo/mpls_label_dpo.c
new file mode 100644
index 00000000..2a6e7dd5
--- /dev/null
+++ b/src/vnet/dpo/mpls_label_dpo.c
@@ -0,0 +1,703 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/mpls_label_dpo.h>
+#include <vnet/mpls/mpls.h>
+
+/*
+ * pool of all MPLS Label DPOs
+ */
+mpls_label_dpo_t *mpls_label_dpo_pool;
+
+static mpls_label_dpo_t *
+mpls_label_dpo_alloc (void)
+{
+    mpls_label_dpo_t *mld;
+
+    pool_get_aligned(mpls_label_dpo_pool, mld, CLIB_CACHE_LINE_BYTES);
+    memset(mld, 0, sizeof(*mld));
+
+    dpo_reset(&mld->mld_dpo);
+
+    return (mld);
+}
+
+static index_t
+mpls_label_dpo_get_index (mpls_label_dpo_t *mld)
+{
+    return (mld - mpls_label_dpo_pool);
+}
+
+index_t
+mpls_label_dpo_create (mpls_label_t *label_stack,
+                       mpls_eos_bit_t eos,
+                       u8 ttl,
+                       u8 exp,
+                       dpo_proto_t payload_proto,
+                       const dpo_id_t *dpo)
+{
+    mpls_label_dpo_t *mld;
+    u32 ii;
+
+    mld = mpls_label_dpo_alloc();
+    mld->mld_n_labels = vec_len(label_stack);
+    mld->mld_n_hdr_bytes = mld->mld_n_labels * sizeof(mld->mld_hdr[0]);
+    mld->mld_payload_proto = payload_proto;
+
+    /*
+     * construct label rewrite headers for each label value passed.
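+     *
+     * For example (illustrative values): label_stack = {16001, 16002},
+     * eos = MPLS_EOS, ttl = 64, exp = 0 produces
+     *   mld_hdr[0] = label 16001, non-EOS, exp 0, ttl 255  (outer-most)
+     *   mld_hdr[1] = label 16002, EOS,     exp 0, ttl 64   (inner-most)
+     *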
+     * get the header in network byte order since we will paint it
+     * on a packet in the data-plane
+     */
+
+    for (ii = 0; ii < mld->mld_n_labels-1; ii++)
+    {
+        vnet_mpls_uc_set_label(&mld->mld_hdr[ii].label_exp_s_ttl, label_stack[ii]);
+        vnet_mpls_uc_set_ttl(&mld->mld_hdr[ii].label_exp_s_ttl, 255);
+        vnet_mpls_uc_set_exp(&mld->mld_hdr[ii].label_exp_s_ttl, 0);
+        vnet_mpls_uc_set_s(&mld->mld_hdr[ii].label_exp_s_ttl, MPLS_NON_EOS);
+        mld->mld_hdr[ii].label_exp_s_ttl =
+            clib_host_to_net_u32(mld->mld_hdr[ii].label_exp_s_ttl);
+    }
+
+    /*
+     * the inner most label
+     */
+    ii = mld->mld_n_labels-1;
+
+    vnet_mpls_uc_set_label(&mld->mld_hdr[ii].label_exp_s_ttl, label_stack[ii]);
+    vnet_mpls_uc_set_ttl(&mld->mld_hdr[ii].label_exp_s_ttl, ttl);
+    vnet_mpls_uc_set_exp(&mld->mld_hdr[ii].label_exp_s_ttl, exp);
+    vnet_mpls_uc_set_s(&mld->mld_hdr[ii].label_exp_s_ttl, eos);
+    mld->mld_hdr[ii].label_exp_s_ttl =
+        clib_host_to_net_u32(mld->mld_hdr[ii].label_exp_s_ttl);
+
+    /*
+     * stack this label object on its parent.
+     */
+    dpo_stack(DPO_MPLS_LABEL,
+              mld->mld_payload_proto,
+              &mld->mld_dpo,
+              dpo);
+
+    return (mpls_label_dpo_get_index(mld));
+}
+
+u8*
+format_mpls_label_dpo (u8 *s, va_list *args)
+{
+    index_t index = va_arg (*args, index_t);
+    u32 indent = va_arg (*args, u32);
+    mpls_unicast_header_t hdr;
+    mpls_label_dpo_t *mld;
+    u32 ii;
+
+    s = format(s, "mpls-label:[%d]:", index);
+
+    if (pool_is_free_index(mpls_label_dpo_pool, index))
+    {
+        /*
+         * the packet trace can be printed after the DPO has been deleted
+         */
+        return (s);
+    }
+
+    mld = mpls_label_dpo_get(index);
+
+    for (ii = 0; ii < mld->mld_n_labels; ii++)
+    {
+        hdr.label_exp_s_ttl =
+            clib_net_to_host_u32(mld->mld_hdr[ii].label_exp_s_ttl);
+        s = format(s, "%U", format_mpls_header, hdr);
+    }
+
+    s = format(s, "\n%U", format_white_space, indent);
+    s = format(s, "%U", format_dpo_id, &mld->mld_dpo, indent+2);
+
+    return (s);
+}
+
+static void
+mpls_label_dpo_lock (dpo_id_t *dpo)
+{
+    mpls_label_dpo_t *mld;
+
+    mld = mpls_label_dpo_get(dpo->dpoi_index);
+
+    mld->mld_locks++;
+}
+
+static void
+mpls_label_dpo_unlock (dpo_id_t *dpo)
+{
+    mpls_label_dpo_t *mld;
+
+    mld = mpls_label_dpo_get(dpo->dpoi_index);
+
+    mld->mld_locks--;
+
+    if (0 == mld->mld_locks)
+    {
+        dpo_reset(&mld->mld_dpo);
+        pool_put(mpls_label_dpo_pool, mld);
+    }
+}
+
+/**
+ * @brief A struct to hold tracing information for the MPLS label imposition
+ * node.
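+ * Note: when a stack of more than one label is imposed, the header
+ * recorded is the inner-most (TTL-fixed) one, since that is where the
+ * pointer returned by mpls_label_paint() lands.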
+ */ +typedef struct mpls_label_imposition_trace_t_ +{ + /** + * The MPLS header imposed + */ + mpls_unicast_header_t hdr; +} mpls_label_imposition_trace_t; + +always_inline mpls_unicast_header_t * +mpls_label_paint (vlib_buffer_t * b0, + mpls_label_dpo_t *mld0, + u8 ttl0) +{ + mpls_unicast_header_t *hdr0; + + vlib_buffer_advance(b0, -(mld0->mld_n_hdr_bytes)); + + hdr0 = vlib_buffer_get_current(b0); + + if (1 == mld0->mld_n_labels) + { + /* optimise for the common case of one label */ + *hdr0 = mld0->mld_hdr[0]; + } + else + { + clib_memcpy(hdr0, mld0->mld_hdr, mld0->mld_n_hdr_bytes); + hdr0 = hdr0 + (mld0->mld_n_labels - 1); + } + /* fixup the TTL for the inner most label */ + ((char*)hdr0)[3] = ttl0; + + return (hdr0); +} + +always_inline uword +mpls_label_imposition_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, + u8 payload_is_ip4, + u8 payload_is_ip6, + u8 payload_is_ethernet) +{ + u32 n_left_from, next_index, * from, * to_next; + + from = vlib_frame_vector_args (from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 8 && n_left_to_next >= 4) + { + u32 bi0, mldi0, bi1, mldi1, bi2, mldi2, bi3, mldi3; + mpls_unicast_header_t *hdr0, *hdr1, *hdr2, *hdr3; + mpls_label_dpo_t *mld0, *mld1, *mld2, *mld3; + vlib_buffer_t * b0, *b1, * b2, *b3; + u32 next0, next1, next2, next3; + u8 ttl0, ttl1,ttl2, ttl3 ; + + bi0 = to_next[0] = from[0]; + bi1 = to_next[1] = from[1]; + bi2 = to_next[2] = from[2]; + bi3 = to_next[3] = from[3]; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3, *p4, *p5; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + p4 = vlib_get_buffer (vm, from[4]); + p5 = vlib_get_buffer (vm, from[5]); + + vlib_prefetch_buffer_header (p2, STORE); + vlib_prefetch_buffer_header (p3, STORE); + vlib_prefetch_buffer_header (p4, STORE); + vlib_prefetch_buffer_header (p5, STORE); + + CLIB_PREFETCH (p2->data, sizeof (hdr0[0]), STORE); + CLIB_PREFETCH (p3->data, sizeof (hdr0[0]), STORE); + CLIB_PREFETCH (p4->data, sizeof (hdr0[0]), STORE); + CLIB_PREFETCH (p5->data, sizeof (hdr0[0]), STORE); + } + + from += 4; + to_next += 4; + n_left_from -= 4; + n_left_to_next -= 4; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + b2 = vlib_get_buffer (vm, bi2); + b3 = vlib_get_buffer (vm, bi3); + + /* dst lookup was done by ip4 lookup */ + mldi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX]; + mldi1 = vnet_buffer(b1)->ip.adj_index[VLIB_TX]; + mldi2 = vnet_buffer(b2)->ip.adj_index[VLIB_TX]; + mldi3 = vnet_buffer(b3)->ip.adj_index[VLIB_TX]; + mld0 = mpls_label_dpo_get(mldi0); + mld1 = mpls_label_dpo_get(mldi1); + mld2 = mpls_label_dpo_get(mldi2); + mld3 = mpls_label_dpo_get(mldi3); + + if (payload_is_ip4) + { + /* + * decrement the TTL on ingress to the LSP + */ + ip4_header_t * ip0 = vlib_buffer_get_current(b0); + ip4_header_t * ip1 = vlib_buffer_get_current(b1); + ip4_header_t * ip2 = vlib_buffer_get_current(b2); + ip4_header_t * ip3 = vlib_buffer_get_current(b3); + u32 checksum0; + u32 checksum1; + u32 checksum2; + u32 checksum3; + + checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100); + checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100); + checksum2 = ip2->checksum + clib_host_to_net_u16 (0x0100); + checksum3 = ip3->checksum + clib_host_to_net_u16 (0x0100); + + checksum0 += checksum0 >= 0xffff; + checksum1 += 
checksum1 >= 0xffff;
+                checksum2 += checksum2 >= 0xffff;
+                checksum3 += checksum3 >= 0xffff;
+
+                ip0->checksum = checksum0;
+                ip1->checksum = checksum1;
+                ip2->checksum = checksum2;
+                ip3->checksum = checksum3;
+
+                ip0->ttl -= 1;
+                ip1->ttl -= 1;
+                ip2->ttl -= 1;
+                ip3->ttl -= 1;
+
+                ttl0 = ip0->ttl;
+                ttl1 = ip1->ttl;
+                ttl2 = ip2->ttl;
+                ttl3 = ip3->ttl;
+            }
+            else if (payload_is_ip6)
+            {
+                /*
+                 * decrement the TTL on ingress to the LSP
+                 */
+                ip6_header_t * ip0 = vlib_buffer_get_current(b0);
+                ip6_header_t * ip1 = vlib_buffer_get_current(b1);
+                ip6_header_t * ip2 = vlib_buffer_get_current(b2);
+                ip6_header_t * ip3 = vlib_buffer_get_current(b3);
+
+                ip0->hop_limit -= 1;
+                ip1->hop_limit -= 1;
+                ip2->hop_limit -= 1;
+                ip3->hop_limit -= 1;
+
+                ttl0 = ip0->hop_limit;
+                ttl1 = ip1->hop_limit;
+                ttl2 = ip2->hop_limit;
+                ttl3 = ip3->hop_limit;
+            }
+            else if (payload_is_ethernet)
+            {
+                /*
+                 * nothing to change in the ethernet header
+                 */
+                ttl0 = ttl1 = ttl2 = ttl3 = 255;
+            }
+            else
+            {
+                /*
+                 * else, the packet to be encapped is an MPLS packet
+                 */
+                if (PREDICT_TRUE(vnet_buffer(b0)->mpls.first))
+                {
+                    /*
+                     * The first label to be imposed on the packet, i.e. a
+                     * label swap, in which case the TTL and EXP bits were
+                     * stashed in the packet meta-data in the lookup node.
+                     */
+                    ASSERT(0 != vnet_buffer (b0)->mpls.ttl);
+
+                    ttl0 = vnet_buffer(b0)->mpls.ttl - 1;
+                }
+                else
+                {
+                    /*
+                     * not the first label, implying we are recursing down a
+                     * chain of output labels.
+                     * Each layer is considered a new LSP - hence the TTL is
+                     * reset.
+                     */
+                    ttl0 = 255;
+                }
+                if (PREDICT_TRUE(vnet_buffer(b1)->mpls.first))
+                {
+                    ASSERT(0 != vnet_buffer (b1)->mpls.ttl);
+                    ttl1 = vnet_buffer(b1)->mpls.ttl - 1;
+                }
+                else
+                {
+                    ttl1 = 255;
+                }
+                if (PREDICT_TRUE(vnet_buffer(b2)->mpls.first))
+                {
+                    ASSERT(0 != vnet_buffer (b2)->mpls.ttl);
+
+                    ttl2 = vnet_buffer(b2)->mpls.ttl - 1;
+                }
+                else
+                {
+                    ttl2 = 255;
+                }
+                if (PREDICT_TRUE(vnet_buffer(b3)->mpls.first))
+                {
+                    ASSERT(0 != vnet_buffer (b3)->mpls.ttl);
+                    ttl3 = vnet_buffer(b3)->mpls.ttl - 1;
+                }
+                else
+                {
+                    ttl3 = 255;
+                }
+            }
+            vnet_buffer(b0)->mpls.first = 0;
+            vnet_buffer(b1)->mpls.first = 0;
+            vnet_buffer(b2)->mpls.first = 0;
+            vnet_buffer(b3)->mpls.first = 0;
+
+            /* Paint the MPLS header */
+            hdr0 = mpls_label_paint(b0, mld0, ttl0);
+            hdr1 = mpls_label_paint(b1, mld1, ttl1);
+            hdr2 = mpls_label_paint(b2, mld2, ttl2);
+            hdr3 = mpls_label_paint(b3, mld3, ttl3);
+
+            next0 = mld0->mld_dpo.dpoi_next_node;
+            next1 = mld1->mld_dpo.dpoi_next_node;
+            next2 = mld2->mld_dpo.dpoi_next_node;
+            next3 = mld3->mld_dpo.dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mld0->mld_dpo.dpoi_index;
+            vnet_buffer(b1)->ip.adj_index[VLIB_TX] = mld1->mld_dpo.dpoi_index;
+            vnet_buffer(b2)->ip.adj_index[VLIB_TX] = mld2->mld_dpo.dpoi_index;
+            vnet_buffer(b3)->ip.adj_index[VLIB_TX] = mld3->mld_dpo.dpoi_index;
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b0, sizeof (*tr));
+                tr->hdr = *hdr0;
+            }
+            if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b1, sizeof (*tr));
+                tr->hdr = *hdr1;
+            }
+            if (PREDICT_FALSE(b2->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b2, sizeof (*tr));
+                tr->hdr = *hdr2;
+            }
+            if (PREDICT_FALSE(b3->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b3, sizeof (*tr));
+                tr->hdr = *hdr3;
+            }
+
+            vlib_validate_buffer_enqueue_x4(vm, node, next_index, to_next,
+                                            n_left_to_next,
+                                            bi0, bi1, bi2, bi3,
+                                            next0, next1, next2, next3);
+        }
+
+        while (n_left_from > 0 && n_left_to_next > 0)
+        {
+            mpls_unicast_header_t *hdr0;
+            mpls_label_dpo_t *mld0;
+            vlib_buffer_t * b0;
+            u32 bi0, mldi0;
+            u32 next0;
+            u8 ttl;
+
+            bi0 = from[0];
+            to_next[0] = bi0;
+            from += 1;
+            to_next += 1;
+            n_left_from -= 1;
+            n_left_to_next -= 1;
+
+            b0 = vlib_get_buffer (vm, bi0);
+
+            /* dst lookup was done by ip4 lookup */
+            mldi0 = vnet_buffer(b0)->ip.adj_index[VLIB_TX];
+            mld0 = mpls_label_dpo_get(mldi0);
+
+            if (payload_is_ip4)
+            {
+                /*
+                 * decrement the TTL on ingress to the LSP
+                 */
+                ip4_header_t * ip0 = vlib_buffer_get_current(b0);
+                u32 checksum0;
+
+                checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100);
+                checksum0 += checksum0 >= 0xffff;
+
+                ip0->checksum = checksum0;
+                ip0->ttl -= 1;
+                ttl = ip0->ttl;
+            }
+            else if (payload_is_ip6)
+            {
+                /*
+                 * decrement the TTL on ingress to the LSP
+                 */
+                ip6_header_t * ip0 = vlib_buffer_get_current(b0);
+
+                ip0->hop_limit -= 1;
+                ttl = ip0->hop_limit;
+            }
+            else
+            {
+                /*
+                 * else, the packet to be encapped is an MPLS packet
+                 */
+                if (vnet_buffer(b0)->mpls.first)
+                {
+                    /*
+                     * The first label to be imposed on the packet, i.e. a
+                     * label swap, in which case the TTL and EXP bits were
+                     * stashed in the packet meta-data in the lookup node.
+                     */
+                    ASSERT(0 != vnet_buffer (b0)->mpls.ttl);
+
+                    ttl = vnet_buffer(b0)->mpls.ttl - 1;
+                }
+                else
+                {
+                    /*
+                     * not the first label, implying we are recursing down a
+                     * chain of output labels.
+                     * Each layer is considered a new LSP - hence the TTL is
+                     * reset.
+                     */
+                    ttl = 255;
+                }
+            }
+            vnet_buffer(b0)->mpls.first = 0;
+
+            /* Paint the MPLS header */
+            vlib_buffer_advance(b0, -(mld0->mld_n_hdr_bytes));
+            hdr0 = vlib_buffer_get_current(b0);
+            clib_memcpy(hdr0, mld0->mld_hdr, mld0->mld_n_hdr_bytes);
+
+            /* fixup the TTL for the inner most label */
+            hdr0 = hdr0 + (mld0->mld_n_labels - 1);
+            ((char*)hdr0)[3] = ttl;
+
+            next0 = mld0->mld_dpo.dpoi_next_node;
+            vnet_buffer(b0)->ip.adj_index[VLIB_TX] = mld0->mld_dpo.dpoi_index;
+
+            if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+                mpls_label_imposition_trace_t *tr =
+                    vlib_add_trace (vm, node, b0, sizeof (*tr));
+                tr->hdr = *hdr0;
+            }
+
+            vlib_validate_buffer_enqueue_x1(vm, node, next_index, to_next,
+                                            n_left_to_next, bi0, next0);
+        }
+        vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+    return from_frame->n_vectors;
+}
+
+static u8 *
+format_mpls_label_imposition_trace (u8 * s, va_list * args)
+{
+    CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+    CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+    mpls_label_imposition_trace_t * t;
+    mpls_unicast_header_t hdr;
+    uword indent;
+
+    t = va_arg (*args, mpls_label_imposition_trace_t *);
+    indent = format_get_indent (s);
+    hdr.label_exp_s_ttl = clib_net_to_host_u32(t->hdr.label_exp_s_ttl);
+
+    s = format (s, "%Umpls-header:%U",
+                format_white_space, indent,
+                format_mpls_header, hdr);
+    return (s);
+}
+
+static uword
+mpls_label_imposition (vlib_main_t * vm,
+                       vlib_node_runtime_t * node,
+                       vlib_frame_t * frame)
+{
+    return (mpls_label_imposition_inline(vm, node, frame, 0, 0, 0));
+}
+
+VLIB_REGISTER_NODE (mpls_label_imposition_node) = {
+    .function = mpls_label_imposition,
+    .name = "mpls-label-imposition",
+    .vector_size = sizeof (u32),
+
+    .format_trace = format_mpls_label_imposition_trace,
+    .n_next_nodes = 1,
+    .next_nodes = {
+        [0] = "mpls-drop",
+    }
+};
+VLIB_NODE_FUNCTION_MULTIARCH
(mpls_label_imposition_node, + mpls_label_imposition) + +static uword +ip4_mpls_label_imposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_imposition_inline(vm, node, frame, 1, 0, 0)); +} + +VLIB_REGISTER_NODE (ip4_mpls_label_imposition_node) = { + .function = ip4_mpls_label_imposition, + .name = "ip4-mpls-label-imposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_imposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "ip4-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (ip4_mpls_label_imposition_node, + ip4_mpls_label_imposition) + +static uword +ip6_mpls_label_imposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_imposition_inline(vm, node, frame, 0, 1, 0)); +} + +VLIB_REGISTER_NODE (ip6_mpls_label_imposition_node) = { + .function = ip6_mpls_label_imposition, + .name = "ip6-mpls-label-imposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_imposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "ip6-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (ip6_mpls_label_imposition_node, + ip6_mpls_label_imposition) + +static uword +ethernet_mpls_label_imposition (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (mpls_label_imposition_inline(vm, node, frame, 0, 0, 1)); +} + +VLIB_REGISTER_NODE (ethernet_mpls_label_imposition_node) = { + .function = ethernet_mpls_label_imposition, + .name = "ethernet-mpls-label-imposition", + .vector_size = sizeof (u32), + + .format_trace = format_mpls_label_imposition_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + } +}; +VLIB_NODE_FUNCTION_MULTIARCH (ethernet_mpls_label_imposition_node, + ethernet_mpls_label_imposition) + +static void +mpls_label_dpo_mem_show (void) +{ + fib_show_memory_usage("MPLS label", + pool_elts(mpls_label_dpo_pool), + pool_len(mpls_label_dpo_pool), + sizeof(mpls_label_dpo_t)); +} + +const static dpo_vft_t mld_vft = { + .dv_lock = mpls_label_dpo_lock, + .dv_unlock = mpls_label_dpo_unlock, + .dv_format = format_mpls_label_dpo, + .dv_mem_show = mpls_label_dpo_mem_show, +}; + +const static char* const mpls_label_imp_ip4_nodes[] = +{ + "ip4-mpls-label-imposition", + NULL, +}; +const static char* const mpls_label_imp_ip6_nodes[] = +{ + "ip6-mpls-label-imposition", + NULL, +}; +const static char* const mpls_label_imp_mpls_nodes[] = +{ + "mpls-label-imposition", + NULL, +}; +const static char* const mpls_label_imp_ethernet_nodes[] = +{ + "ethernet-mpls-label-imposition", + NULL, +}; + +const static char* const * const mpls_label_imp_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = mpls_label_imp_ip4_nodes, + [DPO_PROTO_IP6] = mpls_label_imp_ip6_nodes, + [DPO_PROTO_MPLS] = mpls_label_imp_mpls_nodes, + [DPO_PROTO_ETHERNET] = mpls_label_imp_ethernet_nodes, +}; + + +void +mpls_label_dpo_module_init (void) +{ + dpo_register(DPO_MPLS_LABEL, &mld_vft, mpls_label_imp_nodes); +} diff --git a/src/vnet/dpo/mpls_label_dpo.h b/src/vnet/dpo/mpls_label_dpo.h new file mode 100644 index 00000000..e23f3d26 --- /dev/null +++ b/src/vnet/dpo/mpls_label_dpo.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __MPLS_LABEL_DPO_H__
+#define __MPLS_LABEL_DPO_H__
+
+#include <vnet/vnet.h>
+#include <vnet/mpls/packet.h>
+#include <vnet/dpo/dpo.h>
+
+/**
+ * A representation of an MPLS label for imposition in the data-path
+ */
+typedef struct mpls_label_dpo_t
+{
+    /**
+     * The MPLS label header to impose. Outer most label first.
+     */
+    mpls_unicast_header_t mld_hdr[8];
+
+    /**
+     * Next DPO in the graph
+     */
+    dpo_id_t mld_dpo;
+
+    /**
+     * The protocol of the payload/packets that are being encapped
+     */
+    dpo_proto_t mld_payload_proto;
+
+    /**
+     * Size of the label stack
+     */
+    u16 mld_n_labels;
+
+    /**
+     * Cached amount of header bytes to paint
+     */
+    u16 mld_n_hdr_bytes;
+
+    /**
+     * Number of locks/users of the label
+     */
+    u16 mld_locks;
+} mpls_label_dpo_t;
+
+/**
+ * @brief Assert that the MPLS label object is less than a cache line in size.
+ * Should this get any bigger, then we will need to reconsider how many labels
+ * can be pushed in one object.
+ */
+STATIC_ASSERT((sizeof(mpls_label_dpo_t) <= CLIB_CACHE_LINE_BYTES),
+              "MPLS label DPO is larger than one cache line.");
+
+/**
+ * @brief Create an MPLS label object
+ *
+ * @param label_stack The stack of labels to impose, outer most label first
+ * @param eos The inner most label's EOS bit
+ * @param ttl The inner most label's TTL value
+ * @param exp The inner most label's EXP value
+ * @param payload_proto The protocol of the payload packets that will
+ *                      be imposed with this label header.
+ * @param dpo The parent of the created MPLS label object
+ */
+extern index_t mpls_label_dpo_create(mpls_label_t *label_stack,
+                                     mpls_eos_bit_t eos,
+                                     u8 ttl,
+                                     u8 exp,
+                                     dpo_proto_t payload_proto,
+                                     const dpo_id_t *dpo);
+
+extern u8* format_mpls_label_dpo(u8 *s, va_list *args);
+
+
+/*
+ * Encapsulation violation for fast data-path access
+ */
+extern mpls_label_dpo_t *mpls_label_dpo_pool;
+
+static inline mpls_label_dpo_t *
+mpls_label_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(mpls_label_dpo_pool, index));
+}
+
+extern void mpls_label_dpo_module_init(void);
+
+#endif
diff --git a/src/vnet/dpo/punt_dpo.c b/src/vnet/dpo/punt_dpo.c
new file mode 100644
index 00000000..d1661dcc
--- /dev/null
+++ b/src/vnet/dpo/punt_dpo.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing punting the packet
+ */
+
+#include <vnet/dpo/dpo.h>
+
+static dpo_id_t punt_dpos[DPO_PROTO_NUM];
+
+const dpo_id_t *
+punt_dpo_get (dpo_proto_t proto)
+{
+    dpo_set(&punt_dpos[proto], DPO_PUNT, proto, 1);
+
+    return (&punt_dpos[proto]);
+}
+
+int
+dpo_is_punt (const dpo_id_t *dpo)
+{
+    return (dpo->dpoi_type == DPO_PUNT);
+}
+
+static void
+punt_dpo_lock (dpo_id_t *dpo)
+{
+    /*
+     * not maintaining a lock count on the punt; more trouble than it's worth.
+     * There always needs to be one around. no point in managing its lifetime
+     */
+}
+static void
+punt_dpo_unlock (dpo_id_t *dpo)
+{
+}
+
+static u8*
+format_punt_dpo (u8 *s, va_list *ap)
+{
+    CLIB_UNUSED(index_t index) = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+
+    return (format(s, "dpo-punt"));
+}
+
+const static dpo_vft_t punt_vft = {
+    .dv_lock = punt_dpo_lock,
+    .dv_unlock = punt_dpo_unlock,
+    .dv_format = format_punt_dpo,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a punt
+ *        object.
+ *
+ * this means that these graph nodes are ones from which a punt is the
+ * parent object in the DPO-graph.
+ */
+const static char* const punt_ip4_nodes[] =
+{
+    "ip4-punt",
+    NULL,
+};
+const static char* const punt_ip6_nodes[] =
+{
+    "ip6-punt",
+    NULL,
+};
+const static char* const punt_mpls_nodes[] =
+{
+    "mpls-punt",
+    NULL,
+};
+const static char* const * const punt_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4] = punt_ip4_nodes,
+    [DPO_PROTO_IP6] = punt_ip6_nodes,
+    [DPO_PROTO_MPLS] = punt_mpls_nodes,
+};
+
+void
+punt_dpo_module_init (void)
+{
+    dpo_register(DPO_PUNT, &punt_vft, punt_nodes);
+}
diff --git a/src/vnet/dpo/punt_dpo.h b/src/vnet/dpo/punt_dpo.h
new file mode 100644
index 00000000..370547c1
--- /dev/null
+++ b/src/vnet/dpo/punt_dpo.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief A DPO to punt packets to the Control-plane
+ */
+
+#ifndef __PUNT_DPO_H__
+#define __PUNT_DPO_H__
+
+#include <vnet/dpo/dpo.h>
+
+extern int dpo_is_punt(const dpo_id_t *dpo);
+
+extern const dpo_id_t *punt_dpo_get(dpo_proto_t proto);
+
+extern void punt_dpo_module_init(void);
+
+#endif
diff --git a/src/vnet/dpo/receive_dpo.c b/src/vnet/dpo/receive_dpo.c
new file mode 100644
index 00000000..83e33ed8
--- /dev/null
+++ b/src/vnet/dpo/receive_dpo.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing receiving the packet, i.e. it's for-us
+ */
+#include <vlib/vlib.h>
+#include <vnet/ip/ip.h>
+#include <vnet/dpo/receive_dpo.h>
+
+/**
+ * @brief pool of all receive DPOs
+ */
+receive_dpo_t *receive_dpo_pool;
+
+static receive_dpo_t *
+receive_dpo_alloc (void)
+{
+    receive_dpo_t *rd;
+
+    pool_get_aligned(receive_dpo_pool, rd, CLIB_CACHE_LINE_BYTES);
+    memset(rd, 0, sizeof(*rd));
+
+    return (rd);
+}
+
+static receive_dpo_t *
+receive_dpo_get_from_dpo (const dpo_id_t *dpo)
+{
+    ASSERT(DPO_RECEIVE == dpo->dpoi_type);
+
+    return (receive_dpo_get(dpo->dpoi_index));
+}
+
+
+/*
+ * receive_dpo_add_or_lock
+ *
+ * The next_hop address here is used for source address selection in the DP.
+ * The local adj is added to an interface's receive prefix, the next-hop
+ * passed here is the local prefix on the same interface.
+ */
+void
+receive_dpo_add_or_lock (dpo_proto_t proto,
+                         u32 sw_if_index,
+                         const ip46_address_t *nh_addr,
+                         dpo_id_t *dpo)
+{
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_alloc();
+
+    rd->rd_sw_if_index = sw_if_index;
+    if (NULL != nh_addr)
+    {
+        rd->rd_addr = *nh_addr;
+    }
+
+    dpo_set(dpo, DPO_RECEIVE, proto, (rd - receive_dpo_pool));
+}
+
+static void
+receive_dpo_lock (dpo_id_t *dpo)
+{
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_get_from_dpo(dpo);
+    rd->rd_locks++;
+}
+
+static void
+receive_dpo_unlock (dpo_id_t *dpo)
+{
+    receive_dpo_t *rd;
+
+    rd = receive_dpo_get_from_dpo(dpo);
+    rd->rd_locks--;
+
+    if (0 == rd->rd_locks)
+    {
+        pool_put(receive_dpo_pool, rd);
+    }
+}
+
+static u8*
+format_receive_dpo (u8 *s, va_list *ap)
+{
+    CLIB_UNUSED(index_t index) = va_arg(*ap, index_t);
+    CLIB_UNUSED(u32 indent) = va_arg(*ap, u32);
+    vnet_main_t * vnm = vnet_get_main();
+    receive_dpo_t *rd;
+
+    if (pool_is_free_index(receive_dpo_pool, index))
+    {
+        return (format(s, "dpo-receive DELETED"));
+    }
+
+    rd = receive_dpo_get(index);
+
+    if (~0 != rd->rd_sw_if_index)
+    {
+        return (format(s, "dpo-receive: %U on %U",
+                       format_ip46_address, &rd->rd_addr, IP46_TYPE_ANY,
+                       format_vnet_sw_interface_name, vnm,
+                       vnet_get_sw_interface(vnm, rd->rd_sw_if_index)));
+    }
+    else
+    {
+        return (format(s, "dpo-receive"));
+    }
+}
+
+static void
+receive_dpo_mem_show (void)
+{
+    fib_show_memory_usage("Receive",
+                          pool_elts(receive_dpo_pool),
+                          pool_len(receive_dpo_pool),
+                          sizeof(receive_dpo_t));
+}
+
+const static dpo_vft_t receive_vft = {
+    .dv_lock = receive_dpo_lock,
+    .dv_unlock = receive_dpo_unlock,
+    .dv_format = format_receive_dpo,
+    .dv_mem_show = receive_dpo_mem_show,
+};
+
+/**
+ * @brief The per-protocol VLIB graph nodes that are assigned to a receive
+ *        object.
+ *
+ * this means that these graph nodes are ones from which a receive is the
+ * parent object in the DPO-graph.
+ */
+const static char* const receive_ip4_nodes[] =
+{
+    "ip4-local",
+    NULL,
+};
+const static char* const receive_ip6_nodes[] =
+{
+    "ip6-local",
+    NULL,
+};
+
+const static char* const * const receive_nodes[DPO_PROTO_NUM] =
+{
+    [DPO_PROTO_IP4] = receive_ip4_nodes,
+    [DPO_PROTO_IP6] = receive_ip6_nodes,
+    [DPO_PROTO_MPLS] = NULL,
+};
+
+void
+receive_dpo_module_init (void)
+{
+    dpo_register(DPO_RECEIVE, &receive_vft, receive_nodes);
+}
diff --git a/src/vnet/dpo/receive_dpo.h b/src/vnet/dpo/receive_dpo.h
new file mode 100644
index 00000000..2420fd78
--- /dev/null
+++ b/src/vnet/dpo/receive_dpo.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The data-path object representing receiving the packet, i.e. it's for-us
+ */
+
+#ifndef __RECEIVE_DPO_H__
+#define __RECEIVE_DPO_H__
+
+#include <vnet/dpo/dpo.h>
+#include <vnet/ip/ip6.h>
+
+typedef struct receive_dpo_t_
+{
+    /**
+     * The Software interface index on which traffic is received
+     */
+    u32 rd_sw_if_index;
+
+    /**
+     * The address on the receive interface. packets are destined to this address
+     */
+    ip46_address_t rd_addr;
+
+    /**
+     * number of locks.
+     */
+    u16 rd_locks;
+} receive_dpo_t;
+
+extern void receive_dpo_add_or_lock (dpo_proto_t proto,
+                                     u32 sw_if_index,
+                                     const ip46_address_t *nh_addr,
+                                     dpo_id_t *dpo);
+
+extern void receive_dpo_module_init(void);
+
+/**
+ * @brief pool of all receive DPOs
+ */
+extern receive_dpo_t *receive_dpo_pool;
+
+static inline receive_dpo_t *
+receive_dpo_get (index_t index)
+{
+    return (pool_elt_at_index(receive_dpo_pool, index));
+}
+
+#endif
diff --git a/src/vnet/dpo/replicate_dpo.c b/src/vnet/dpo/replicate_dpo.c
new file mode 100644
index 00000000..9fdb9a05
--- /dev/null
+++ b/src/vnet/dpo/replicate_dpo.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vnet/ip/lookup.h>
+#include <vnet/dpo/replicate_dpo.h>
+#include <vnet/dpo/drop_dpo.h>
+#include <vnet/adj/adj.h>
+#include <vnet/mpls/mpls_types.h>
+
+#undef REP_DEBUG
+
+#ifdef REP_DEBUG
+#define REP_DBG(_rep, _fmt, _args...)                            \
+{                                                                \
+    u8* _tmp = NULL;                                             \
+    clib_warning("rep:[%s]:" _fmt,                               \
+                 replicate_format(replicate_get_index((_rep)),   \
+                                  0, _tmp),                      \
+                 ##_args);                                       \
+    vec_free(_tmp);                                              \
+}
+#else
+#define REP_DBG(_p, _fmt, _args...)
+#endif
+
+#define foreach_replicate_dpo_error                       \
+_(BUFFER_ALLOCATION_FAILURE, "Buffer Allocation Failure")
+
+typedef enum {
+#define _(sym,str) REPLICATE_DPO_ERROR_##sym,
+    foreach_replicate_dpo_error
+#undef _
+    REPLICATE_DPO_N_ERROR,
+} replicate_dpo_error_t;
+
+static char * replicate_dpo_error_strings[] = {
+#define _(sym,string) string,
+    foreach_replicate_dpo_error
+#undef _
+};
+
+/**
+ * Pool of all DPOs.
It's not static so the DP can have fast access + */ +replicate_t *replicate_pool; + +/** + * The one instance of replicate main + */ +replicate_main_t replicate_main; + +static inline index_t +replicate_get_index (const replicate_t *rep) +{ + return (rep - replicate_pool); +} + +static inline dpo_id_t* +replicate_get_buckets (replicate_t *rep) +{ + if (REP_HAS_INLINE_BUCKETS(rep)) + { + return (rep->rep_buckets_inline); + } + else + { + return (rep->rep_buckets); + } +} + +static replicate_t * +replicate_alloc_i (void) +{ + replicate_t *rep; + + pool_get_aligned(replicate_pool, rep, CLIB_CACHE_LINE_BYTES); + memset(rep, 0, sizeof(*rep)); + + vlib_validate_combined_counter(&(replicate_main.repm_counters), + replicate_get_index(rep)); + vlib_zero_combined_counter(&(replicate_main.repm_counters), + replicate_get_index(rep)); + + return (rep); +} + +static u8* +replicate_format (index_t repi, + replicate_format_flags_t flags, + u32 indent, + u8 *s) +{ + vlib_counter_t to; + replicate_t *rep; + dpo_id_t *buckets; + u32 i; + + repi &= ~MPLS_IS_REPLICATE; + rep = replicate_get(repi); + vlib_get_combined_counter(&(replicate_main.repm_counters), repi, &to); + buckets = replicate_get_buckets(rep); + + s = format(s, "%U: ", format_dpo_type, DPO_REPLICATE); + s = format(s, "[index:%d buckets:%d ", repi, rep->rep_n_buckets); + s = format(s, "to:[%Ld:%Ld]]", to.packets, to.bytes); + + for (i = 0; i < rep->rep_n_buckets; i++) + { + s = format(s, "\n%U", format_white_space, indent+2); + s = format(s, "[%d]", i); + s = format(s, " %U", format_dpo_id, &buckets[i], indent+6); + } + return (s); +} + +u8* +format_replicate (u8 * s, va_list * args) +{ + index_t repi = va_arg(*args, index_t); + replicate_format_flags_t flags = va_arg(*args, replicate_format_flags_t); + + return (replicate_format(repi, flags, 0, s)); +} +static u8* +format_replicate_dpo (u8 * s, va_list * args) +{ + index_t repi = va_arg(*args, index_t); + u32 indent = va_arg(*args, u32); + + return (replicate_format(repi, REPLICATE_FORMAT_DETAIL, indent, s)); +} + + +static replicate_t * +replicate_create_i (u32 num_buckets, + dpo_proto_t rep_proto) +{ + replicate_t *rep; + + rep = replicate_alloc_i(); + rep->rep_n_buckets = num_buckets; + rep->rep_proto = rep_proto; + + if (!REP_HAS_INLINE_BUCKETS(rep)) + { + vec_validate_aligned(rep->rep_buckets, + rep->rep_n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + } + + REP_DBG(rep, "create"); + + return (rep); +} + +index_t +replicate_create (u32 n_buckets, + dpo_proto_t rep_proto) +{ + return (replicate_get_index(replicate_create_i(n_buckets, rep_proto))); +} + +static inline void +replicate_set_bucket_i (replicate_t *rep, + u32 bucket, + dpo_id_t *buckets, + const dpo_id_t *next) +{ + dpo_stack(DPO_REPLICATE, rep->rep_proto, &buckets[bucket], next); +} + +void +replicate_set_bucket (index_t repi, + u32 bucket, + const dpo_id_t *next) +{ + replicate_t *rep; + dpo_id_t *buckets; + + repi &= ~MPLS_IS_REPLICATE; + rep = replicate_get(repi); + buckets = replicate_get_buckets(rep); + + ASSERT(bucket < rep->rep_n_buckets); + + replicate_set_bucket_i(rep, bucket, buckets, next); +} + +int +replicate_is_drop (const dpo_id_t *dpo) +{ + replicate_t *rep; + index_t repi; + + if (DPO_REPLICATE != dpo->dpoi_type) + return (0); + + repi = dpo->dpoi_index & ~MPLS_IS_REPLICATE; + rep = replicate_get(repi); + + if (1 == rep->rep_n_buckets) + { + return (dpo_is_drop(replicate_get_bucket_i(rep, 0))); + } + return (0); +} + +const dpo_id_t * +replicate_get_bucket (index_t repi, + u32 bucket) +{ + replicate_t *rep; + + repi &= 
~MPLS_IS_REPLICATE; + rep = replicate_get(repi); + + return (replicate_get_bucket_i(rep, bucket)); +} + + +static load_balance_path_t * +replicate_multipath_next_hop_fixup (load_balance_path_t *nhs, + dpo_proto_t drop_proto) +{ + if (0 == vec_len(nhs)) + { + load_balance_path_t *nh; + + /* + * we need something for the replicate. so use the drop + */ + vec_add2(nhs, nh, 1); + + nh->path_weight = 1; + dpo_copy(&nh->path_dpo, drop_dpo_get(drop_proto)); + } + + return (nhs); +} + +/* + * Fill in adjacencies in block based on corresponding + * next hop adjacencies. + */ +static void +replicate_fill_buckets (replicate_t *rep, + load_balance_path_t *nhs, + dpo_id_t *buckets, + u32 n_buckets) +{ + load_balance_path_t * nh; + u16 ii, bucket; + + bucket = 0; + + /* + * the next-hops have normalised weights. that means their sum is the number + * of buckets we need to fill. + */ + vec_foreach (nh, nhs) + { + for (ii = 0; ii < nh->path_weight; ii++) + { + ASSERT(bucket < n_buckets); + replicate_set_bucket_i(rep, bucket++, buckets, &nh->path_dpo); + } + } +} + +static inline void +replicate_set_n_buckets (replicate_t *rep, + u32 n_buckets) +{ + rep->rep_n_buckets = n_buckets; +} + +void +replicate_multipath_update (const dpo_id_t *dpo, + load_balance_path_t * next_hops) +{ + load_balance_path_t * nh, * nhs; + dpo_id_t *tmp_dpo; + u32 ii, n_buckets; + replicate_t *rep; + index_t repi; + + ASSERT(DPO_REPLICATE == dpo->dpoi_type); + repi = dpo->dpoi_index & ~MPLS_IS_REPLICATE; + rep = replicate_get(repi); + nhs = replicate_multipath_next_hop_fixup(next_hops, + rep->rep_proto); + n_buckets = vec_len(nhs); + + if (0 == rep->rep_n_buckets) + { + /* + * first time initialisation. no packets inflight, so we can write + * at leisure. + */ + replicate_set_n_buckets(rep, n_buckets); + + if (!REP_HAS_INLINE_BUCKETS(rep)) + vec_validate_aligned(rep->rep_buckets, + rep->rep_n_buckets - 1, + CLIB_CACHE_LINE_BYTES); + + replicate_fill_buckets(rep, nhs, + replicate_get_buckets(rep), + n_buckets); + } + else + { + /* + * This is a modification of an existing replicate. + * We need to ensure that packets in flight see a consistent state, that + * is the number of reported buckets the REP has + * is not more than it actually has. So if the + * number of buckets is increasing, we must update the bucket array first, + * then the reported number. vice-versa if the number of buckets goes down. + */ + if (n_buckets == rep->rep_n_buckets) + { + /* + * no change in the number of buckets. we can simply fill what + * is new over what is old. + */ + replicate_fill_buckets(rep, nhs, + replicate_get_buckets(rep), + n_buckets); + } + else if (n_buckets > rep->rep_n_buckets) + { + /* + * we have more buckets. the old replicate map (if there is one) + * will remain valid, i.e. mapping to indices within range, so we + * update it last. + */ + if (n_buckets > REP_NUM_INLINE_BUCKETS && + rep->rep_n_buckets <= REP_NUM_INLINE_BUCKETS) + { + /* + * the new increased number of buckets is crossing the threshold + * from the inline storage to out-line. Alloc the outline buckets + * first, then fixup the number. then reset the inlines. 
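+                 * (the CLIB_MEMORY_BARRIER() calls below order these writes
+                 *  so a data-path reader never sees a bucket count larger
+                 *  than the array it is indexing)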
+                 */
+                ASSERT(NULL == rep->rep_buckets);
+                vec_validate_aligned(rep->rep_buckets,
+                                     n_buckets - 1,
+                                     CLIB_CACHE_LINE_BYTES);
+
+                replicate_fill_buckets(rep, nhs,
+                                       rep->rep_buckets,
+                                       n_buckets);
+                CLIB_MEMORY_BARRIER();
+                replicate_set_n_buckets(rep, n_buckets);
+
+                CLIB_MEMORY_BARRIER();
+
+                for (ii = 0; ii < REP_NUM_INLINE_BUCKETS; ii++)
+                {
+                    dpo_reset(&rep->rep_buckets_inline[ii]);
+                }
+            }
+            else
+            {
+                if (n_buckets <= REP_NUM_INLINE_BUCKETS)
+                {
+                    /*
+                     * we are not crossing the threshold and it's still inline buckets.
+                     * we can write the new over the old.
+                     */
+                    replicate_fill_buckets(rep, nhs,
+                                           replicate_get_buckets(rep),
+                                           n_buckets);
+                    CLIB_MEMORY_BARRIER();
+                    replicate_set_n_buckets(rep, n_buckets);
+                }
+                else
+                {
+                    /*
+                     * we are not crossing the threshold. We need a new bucket array to
+                     * hold the increased number of choices.
+                     */
+                    dpo_id_t *new_buckets, *old_buckets, *tmp_dpo;
+
+                    new_buckets = NULL;
+                    old_buckets = replicate_get_buckets(rep);
+
+                    vec_validate_aligned(new_buckets,
+                                         n_buckets - 1,
+                                         CLIB_CACHE_LINE_BYTES);
+
+                    replicate_fill_buckets(rep, nhs, new_buckets, n_buckets);
+                    CLIB_MEMORY_BARRIER();
+                    rep->rep_buckets = new_buckets;
+                    CLIB_MEMORY_BARRIER();
+                    replicate_set_n_buckets(rep, n_buckets);
+
+                    vec_foreach(tmp_dpo, old_buckets)
+                    {
+                        dpo_reset(tmp_dpo);
+                    }
+                    vec_free(old_buckets);
+                }
+            }
+        }
+        else
+        {
+            /*
+             * bucket size shrinkage.
+             */
+            if (n_buckets <= REP_NUM_INLINE_BUCKETS &&
+                rep->rep_n_buckets > REP_NUM_INLINE_BUCKETS)
+            {
+                /*
+                 * the new decreased number of buckets is crossing the threshold
+                 * from out-line storage to inline:
+                 *   1 - Fill the inline buckets,
+                 *   2 - fixup the number (at this point the inline buckets are
+                 *       used).
+                 *   3 - free the outline buckets
+                 */
+                replicate_fill_buckets(rep, nhs,
+                                       rep->rep_buckets_inline,
+                                       n_buckets);
+                CLIB_MEMORY_BARRIER();
+                replicate_set_n_buckets(rep, n_buckets);
+                CLIB_MEMORY_BARRIER();
+
+                vec_foreach(tmp_dpo, rep->rep_buckets)
+                {
+                    dpo_reset(tmp_dpo);
+                }
+                vec_free(rep->rep_buckets);
+            }
+            else
+            {
+                /*
+                 * not crossing the threshold.
+                 *  1 - update the number to the smaller size
+                 *  2 - write the new buckets
+                 *  3 - reset those no longer used.
+ */ + dpo_id_t *buckets; + u32 old_n_buckets; + + old_n_buckets = rep->rep_n_buckets; + buckets = replicate_get_buckets(rep); + + replicate_set_n_buckets(rep, n_buckets); + CLIB_MEMORY_BARRIER(); + + replicate_fill_buckets(rep, nhs, + buckets, + n_buckets); + + for (ii = n_buckets; ii < old_n_buckets; ii++) + { + dpo_reset(&buckets[ii]); + } + } + } + } + + vec_foreach (nh, nhs) + { + dpo_reset(&nh->path_dpo); + } + vec_free(nhs); +} + +static void +replicate_lock (dpo_id_t *dpo) +{ + replicate_t *rep; + + rep = replicate_get(dpo->dpoi_index); + + rep->rep_locks++; +} + +static void +replicate_destroy (replicate_t *rep) +{ + dpo_id_t *buckets; + int i; + + buckets = replicate_get_buckets(rep); + + for (i = 0; i < rep->rep_n_buckets; i++) + { + dpo_reset(&buckets[i]); + } + + REP_DBG(rep, "destroy"); + if (!REP_HAS_INLINE_BUCKETS(rep)) + { + vec_free(rep->rep_buckets); + } + + pool_put(replicate_pool, rep); +} + +static void +replicate_unlock (dpo_id_t *dpo) +{ + replicate_t *rep; + + rep = replicate_get(dpo->dpoi_index); + + rep->rep_locks--; + + if (0 == rep->rep_locks) + { + replicate_destroy(rep); + } +} + +static void +replicate_mem_show (void) +{ + fib_show_memory_usage("replicate", + pool_elts(replicate_pool), + pool_len(replicate_pool), + sizeof(replicate_t)); +} + +const static dpo_vft_t rep_vft = { + .dv_lock = replicate_lock, + .dv_unlock = replicate_unlock, + .dv_format = format_replicate_dpo, + .dv_mem_show = replicate_mem_show, +}; + +/** + * @brief The per-protocol VLIB graph nodes that are assigned to a replicate + * object. + * + * this means that these graph nodes are ones from which a replicate is the + * parent object in the DPO-graph. + */ +const static char* const replicate_ip4_nodes[] = +{ + "ip4-replicate", + NULL, +}; +const static char* const replicate_ip6_nodes[] = +{ + "ip6-replicate", + NULL, +}; +const static char* const replicate_mpls_nodes[] = +{ + "mpls-replicate", + NULL, +}; + +const static char* const * const replicate_nodes[DPO_PROTO_NUM] = +{ + [DPO_PROTO_IP4] = replicate_ip4_nodes, + [DPO_PROTO_IP6] = replicate_ip6_nodes, + [DPO_PROTO_MPLS] = replicate_mpls_nodes, +}; + +void +replicate_module_init (void) +{ + dpo_register(DPO_REPLICATE, &rep_vft, replicate_nodes); +} + +static clib_error_t * +replicate_show (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + index_t repi = INDEX_INVALID; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%d", &repi)) + ; + else + break; + } + + if (INDEX_INVALID != repi) + { + vlib_cli_output (vm, "%U", format_replicate, repi, + REPLICATE_FORMAT_DETAIL); + } + else + { + replicate_t *rep; + + pool_foreach(rep, replicate_pool, + ({ + vlib_cli_output (vm, "%U", format_replicate, + replicate_get_index(rep), + REPLICATE_FORMAT_NONE); + })); + } + + return 0; +} + +VLIB_CLI_COMMAND (replicate_show_command, static) = { + .path = "show replicate", + .short_help = "show replicate [<index>]", + .function = replicate_show, +}; + +typedef struct replicate_trace_t_ +{ + index_t rep_index; + dpo_id_t dpo; +} replicate_trace_t; + +static uword +replicate_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vlib_combined_counter_main_t * cm = &replicate_main.repm_counters; + replicate_main_t * rm = &replicate_main; + u32 n_left_from, * from, * to_next, next_index; + u32 thread_index = vlib_get_thread_index(); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + 
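+    /*
+     * clone the buffer once per bucket and send each clone to that
+     * bucket's child DPO; the 128 passed to vlib_buffer_clone() below is
+     * the number of header bytes each clone privately copies, the rest
+     * of the packet data is shared between the clones.
+     */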
while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 next0, ci0, bi0, bucket, repi0; + const replicate_t *rep0; + vlib_buffer_t * b0, *c0; + const dpo_id_t *dpo0; + u8 num_cloned; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + b0 = vlib_get_buffer (vm, bi0); + repi0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + rep0 = replicate_get(repi0); + + vlib_increment_combined_counter( + cm, thread_index, repi0, 1, + vlib_buffer_length_in_chain(vm, b0)); + + vec_validate (rm->clones[thread_index], rep0->rep_n_buckets - 1); + + num_cloned = vlib_buffer_clone (vm, bi0, rm->clones[thread_index], rep0->rep_n_buckets, 128); + + if (num_cloned != rep0->rep_n_buckets) + { + vlib_node_increment_counter + (vm, node->node_index, + REPLICATE_DPO_ERROR_BUFFER_ALLOCATION_FAILURE, 1); + } + + for (bucket = 0; bucket < num_cloned; bucket++) + { + ci0 = rm->clones[thread_index][bucket]; + c0 = vlib_get_buffer(vm, ci0); + + to_next[0] = ci0; + to_next += 1; + n_left_to_next -= 1; + + dpo0 = replicate_get_bucket_i(rep0, bucket); + next0 = dpo0->dpoi_next_node; + vnet_buffer (c0)->ip.adj_index[VLIB_TX] = dpo0->dpoi_index; + + if (PREDICT_FALSE(c0->flags & VLIB_BUFFER_IS_TRACED)) + { + replicate_trace_t *t = vlib_add_trace (vm, node, c0, sizeof (*t)); + t->rep_index = repi0; + t->dpo = *dpo0; + } + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + ci0, next0); + if (PREDICT_FALSE (n_left_to_next == 0)) + { + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + } + } + vec_reset_length (rm->clones[thread_index]); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static u8 * +format_replicate_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + replicate_trace_t *t = va_arg (*args, replicate_trace_t *); + + s = format (s, "replicate: %d via %U", + t->rep_index, + format_dpo_id, &t->dpo, 0); + return s; +} + +static uword +ip4_replicate (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (replicate_inline (vm, node, frame)); +} + +/** + * @brief IP4 replication node + */ +VLIB_REGISTER_NODE (ip4_replicate_node) = { + .function = ip4_replicate, + .name = "ip4-replicate", + .vector_size = sizeof (u32), + + .n_errors = ARRAY_LEN(replicate_dpo_error_strings), + .error_strings = replicate_dpo_error_strings, + + .format_trace = format_replicate_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "ip4-drop", + }, +}; + +static uword +ip6_replicate (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (replicate_inline (vm, node, frame)); +} + +/** + * @brief IPv6 replication node + */ +VLIB_REGISTER_NODE (ip6_replicate_node) = { + .function = ip6_replicate, + .name = "ip6-replicate", + .vector_size = sizeof (u32), + + .n_errors = ARRAY_LEN(replicate_dpo_error_strings), + .error_strings = replicate_dpo_error_strings, + + .format_trace = format_replicate_trace, + .n_next_nodes = 1, + .next_nodes = { + [0] = "ip6-drop", + }, +}; + +static uword +mpls_replicate (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return (replicate_inline (vm, node, frame)); +} + +/** + * @brief MPLS replication node + */ 
+VLIB_REGISTER_NODE (mpls_replicate_node) = {
+    .function = mpls_replicate,
+    .name = "mpls-replicate",
+    .vector_size = sizeof (u32),
+
+    .n_errors = ARRAY_LEN(replicate_dpo_error_strings),
+    .error_strings = replicate_dpo_error_strings,
+
+    .format_trace = format_replicate_trace,
+    .n_next_nodes = 1,
+    .next_nodes = {
+        [0] = "mpls-drop",
+    },
+};
+
+clib_error_t *
+replicate_dpo_init (vlib_main_t * vm)
+{
+    replicate_main_t * rm = &replicate_main;
+
+    vec_validate (rm->clones, vlib_num_workers());
+
+    return 0;
+}
+
+VLIB_INIT_FUNCTION (replicate_dpo_init);
diff --git a/src/vnet/dpo/replicate_dpo.h b/src/vnet/dpo/replicate_dpo.h
new file mode 100644
index 00000000..7383184a
--- /dev/null
+++ b/src/vnet/dpo/replicate_dpo.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @brief
+ * The replicate DPO: sends a copy of the packet to each of its child DPOs
+ */
+
+#ifndef __REPLICATE_DPO_H__
+#define __REPLICATE_DPO_H__
+
+#include <vlib/vlib.h>
+#include <vnet/ip/lookup.h>
+#include <vnet/dpo/dpo.h>
+#include <vnet/dpo/load_balance.h>
+#include <vnet/fib/fib_types.h>
+#include <vnet/mpls/mpls_types.h>
+
+/**
+ * replicate main
+ */
+typedef struct replicate_main_t_
+{
+    vlib_combined_counter_main_t repm_counters;
+
+    /* per-cpu vector of cloned packets */
+    u32 **clones;
+} replicate_main_t;
+
+extern replicate_main_t replicate_main;
+
+/**
+ * The number of buckets that a replicate object can have and still
+ * fit in one cache-line
+ */
+#define REP_NUM_INLINE_BUCKETS 4
+
+/**
+ * The FIB DPO provides:
+ *  - replication of the packet to the next DPOs in the chain/graph
+ *  - per-route counters
+ */
+typedef struct replicate_t_ {
+    /**
+     * number of buckets in the replicate. always a power of 2.
+     */
+    u16 rep_n_buckets;
+
+    /**
+     * The protocol of packets that traverse this REP.
+     * needed in combination with the flow hash config to determine how to hash.
+     * u8.
+     */
+    dpo_proto_t rep_proto;
+
+    /**
+     * The number of locks, which is approximately the number of users,
+     * of this replicate.
+     * Load-balance objects of via-entries are heavily shared by recursives,
+     * so the lock count is a u32.
+     */
+    u32 rep_locks;
+
+    /**
+     * Vector of buckets containing the next DPOs, sized as rep_n_buckets
+     */
+    dpo_id_t *rep_buckets;
+
+    /**
+     * The rest of the cache line is used for buckets. In the common case
+     * where there are fewer than 4 buckets, the buckets are
+     * on the same cache line and we save ourselves a pointer dereference in
+     * the data-path.
+     */
+    dpo_id_t rep_buckets_inline[REP_NUM_INLINE_BUCKETS];
+} replicate_t;
+
+STATIC_ASSERT(sizeof(replicate_t) <= CLIB_CACHE_LINE_BYTES,
+              "A replicate object size exceeds one cache line");
+
+/**
+ * Flags controlling replicate formatting/display
+ */
+typedef enum replicate_format_flags_t_ {
+    REPLICATE_FORMAT_NONE,
+    REPLICATE_FORMAT_DETAIL = (1 << 0),
+} replicate_format_flags_t;
+
+extern index_t replicate_create(u32 num_buckets,
+                                dpo_proto_t rep_proto);
+extern void replicate_multipath_update(
+    const dpo_id_t *dpo,
+    load_balance_path_t *next_hops);
+
+extern void replicate_set_bucket(index_t repi,
+                                 u32 bucket,
+                                 const dpo_id_t *next);
+
+extern u8* format_replicate(u8 * s, va_list * args);
+
+extern const dpo_id_t *replicate_get_bucket(index_t repi,
+                                            u32 bucket);
+extern int replicate_is_drop(const dpo_id_t *dpo);
+
+/**
+ * The encapsulation breakages are for fast DP access
+ */
+extern replicate_t *replicate_pool;
+static inline replicate_t*
+replicate_get (index_t repi)
+{
+    repi &= ~MPLS_IS_REPLICATE;
+    return (pool_elt_at_index(replicate_pool, repi));
+}
+
+#define REP_HAS_INLINE_BUCKETS(_rep)                    \
+    ((_rep)->rep_n_buckets <= REP_NUM_INLINE_BUCKETS)
+
+static inline const dpo_id_t *
+replicate_get_bucket_i (const replicate_t *rep,
+                        u32 bucket)
+{
+    ASSERT(bucket < rep->rep_n_buckets);
+
+    if (PREDICT_TRUE(REP_HAS_INLINE_BUCKETS(rep)))
+    {
+        return (&rep->rep_buckets_inline[bucket]);
+    }
+    else
+    {
+        return (&rep->rep_buckets[bucket]);
+    }
+}
+
+extern void replicate_module_init(void);
+
+#endif
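
For context, a minimal sketch of how the API declared above might be used to build a replicate that copies packets to two pre-resolved child DPOs. This is illustrative only and not part of the diff; the function name and the children dpo1/dpo2 are assumptions.

#include <vnet/dpo/replicate_dpo.h>

/*
 * illustrative sketch: build a 2-bucket IPv4 replicate whose buckets
 * are the two (assumed already-resolved) child DPOs passed in.
 */
static index_t
example_replicate_build (const dpo_id_t *dpo1, const dpo_id_t *dpo2)
{
    index_t repi;

    /* two buckets, IPv4 payload */
    repi = replicate_create(2, DPO_PROTO_IP4);

    /* each call stacks the child on the bucket, so packets leaving
     * bucket n are handed to that child's graph node */
    replicate_set_bucket(repi, 0, dpo1);
    replicate_set_bucket(repi, 1, dpo2);

    return (repi);
}

Packets forwarded via the resulting index are then cloned once per bucket in replicate_inline(), with the per-bucket child selected via replicate_get_bucket_i().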