diff options
author | Steven Luong <sluong@cisco.com> | 2022-10-24 09:10:59 -0700 |
---|---|---|
committer | Dave Wallace <dwallacelf@gmail.com> | 2023-01-19 21:37:25 +0000 |
commit | 8bd4db5996ba1144f659ea5341f1c2727c650bcd (patch) | |
tree | 05ebf7985499bf10b842faf68af2c1d4a5d7c138 /src/plugins | |
parent | dd2eff6d060c787e4a49863c9b97bde7215f7740 (diff) |
vxlan: convert vxlan to a plugin
per https://jira.fd.io/browse/VPP-2058
Type: improvement
Signed-off-by: Steven Luong <sluong@cisco.com>
Change-Id: Ica0828de218d25ada2d0d1491e373c3b78179ac1
Diffstat (limited to 'src/plugins')
-rw-r--r-- | src/plugins/dpdk/device/flow.c | 2 | ||||
-rw-r--r-- | src/plugins/nsh/FEATURE.yaml | 1 | ||||
-rw-r--r-- | src/plugins/nsh/nsh.c | 16 | ||||
-rw-r--r-- | src/plugins/nsh/nsh_pop.c | 1 | ||||
-rw-r--r-- | src/plugins/vxlan/CMakeLists.txt | 29 | ||||
-rw-r--r-- | src/plugins/vxlan/FEATURE.yaml | 14 | ||||
-rw-r--r-- | src/plugins/vxlan/decap.c | 1330 | ||||
-rw-r--r-- | src/plugins/vxlan/dir.dox | 24 | ||||
-rw-r--r-- | src/plugins/vxlan/encap.c | 540 | ||||
-rw-r--r-- | src/plugins/vxlan/plugin.c | 22 | ||||
-rw-r--r-- | src/plugins/vxlan/vxlan.api | 198 | ||||
-rw-r--r-- | src/plugins/vxlan/vxlan.c | 1349 | ||||
-rw-r--r-- | src/plugins/vxlan/vxlan.h | 242 | ||||
-rw-r--r-- | src/plugins/vxlan/vxlan_api.c | 376 | ||||
-rw-r--r-- | src/plugins/vxlan/vxlan_error.def | 17 | ||||
-rw-r--r-- | src/plugins/vxlan/vxlan_packet.h | 80 |
16 files changed, 4236 insertions, 5 deletions
diff --git a/src/plugins/dpdk/device/flow.c b/src/plugins/dpdk/device/flow.c index 9f765a6f845..ae76b3df169 100644 --- a/src/plugins/dpdk/device/flow.c +++ b/src/plugins/dpdk/device/flow.c @@ -21,7 +21,7 @@ #include <vnet/ip/ip.h> #include <vnet/ethernet/ethernet.h> #include <vnet/ethernet/arp_packet.h> -#include <vnet/vxlan/vxlan.h> +#include <vxlan/vxlan.h> #include <dpdk/device/dpdk.h> #include <dpdk/device/dpdk_priv.h> #include <vppinfra/error.h> diff --git a/src/plugins/nsh/FEATURE.yaml b/src/plugins/nsh/FEATURE.yaml index a6ef3749952..986008e41a5 100644 --- a/src/plugins/nsh/FEATURE.yaml +++ b/src/plugins/nsh/FEATURE.yaml @@ -8,6 +8,7 @@ features: - NSH Proxy - NSH OAM - NSH Metadata + - Requires vxlan_plugin.so to run description: "NSH for SFC" state: production diff --git a/src/plugins/nsh/nsh.c b/src/plugins/nsh/nsh.c index ea084e4a553..391fa8dbac5 100644 --- a/src/plugins/nsh/nsh.c +++ b/src/plugins/nsh/nsh.c @@ -19,7 +19,7 @@ #include <vnet/plugin/plugin.h> #include <nsh/nsh.h> #include <vnet/gre/gre.h> -#include <vnet/vxlan/vxlan.h> +#include <vxlan/vxlan.h> #include <vnet/vxlan-gpe/vxlan_gpe.h> #include <vnet/l2/l2_classify.h> #include <vnet/adj/adj.h> @@ -188,6 +188,7 @@ nsh_init (vlib_main_t * vm) nsh_main_t *nm = &nsh_main; clib_error_t *error = 0; uword next_node; + vlib_node_registration_t *vxlan4_input, *vxlan6_input; /* Init the main structures from VPP */ nm->vlib_main = vm; @@ -250,8 +251,17 @@ nsh_init (vlib_main_t * vm) nsh_aware_vnf_proxy_node.index); /* Add NSH-Proxy support */ - vlib_node_add_next (vm, vxlan4_input_node.index, nm->nsh_proxy_node_index); - vlib_node_add_next (vm, vxlan6_input_node.index, nm->nsh_proxy_node_index); + vxlan4_input = + vlib_get_plugin_symbol ("vxlan_plugin.so", "vxlan4_input_node"); + vxlan6_input = + vlib_get_plugin_symbol ("vxlan_plugin.so", "vxlan6_input_node"); + if (vxlan4_input == 0 || vxlan6_input == 0) + { + error = clib_error_return (0, "vxlan_plugin.so is not loaded"); + return error; + } + 
vlib_node_add_next (vm, vxlan4_input->index, nm->nsh_proxy_node_index); + vlib_node_add_next (vm, vxlan6_input->index, nm->nsh_proxy_node_index); /* Add NSH-Classifier support */ vlib_node_add_next (vm, ip4_classify_node.index, diff --git a/src/plugins/nsh/nsh_pop.c b/src/plugins/nsh/nsh_pop.c index 90b8a73b5fb..8de319e158b 100644 --- a/src/plugins/nsh/nsh_pop.c +++ b/src/plugins/nsh/nsh_pop.c @@ -19,7 +19,6 @@ #include <vnet/plugin/plugin.h> #include <nsh/nsh.h> #include <vnet/gre/packet.h> -#include <vnet/vxlan/vxlan.h> #include <vnet/vxlan-gpe/vxlan_gpe.h> #include <vnet/l2/l2_classify.h> diff --git a/src/plugins/vxlan/CMakeLists.txt b/src/plugins/vxlan/CMakeLists.txt new file mode 100644 index 00000000000..bd0272a868e --- /dev/null +++ b/src/plugins/vxlan/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2022 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +add_vpp_plugin(vxlan + SOURCES + vxlan.c + encap.c + decap.c + vxlan_api.c + plugin.c + vxlan.h + vxlan_packet.h + + MULTIARCH_SOURCES + encap.c + + API_FILES + vxlan.api +) diff --git a/src/plugins/vxlan/FEATURE.yaml b/src/plugins/vxlan/FEATURE.yaml new file mode 100644 index 00000000000..dc7d21b010e --- /dev/null +++ b/src/plugins/vxlan/FEATURE.yaml @@ -0,0 +1,14 @@ +--- +name: Virtual eXtensible LAN +maintainer: John Lo <loj@cisco.com> +features: + - VXLAN tunnel for support of L2 overlay/virtual networks (RFC-7348) + - Support either IPv4 or IPv6 underlay network VTEPs + - Flooding via headend replication if all VXLAN tunnels in BD are unicast ones + - Multicast VXLAN tunnel can be added to BD to flood via IP multicast + - VXLAN encap with flow-hashed source port for better underlay IP load balance + - VXLAN decap optimization via vxlan-bypass IP feature on underlay interfaces + - VXLAN decap HW offload using flow director with DPDK on Intel Fortville NICs +description: "Virtual eXtensible LAN (VXLAN) tunnels support L2 overlay networks that span L3 networks" +state: production +properties: [API, CLI, MULTITHREAD] diff --git a/src/plugins/vxlan/decap.c b/src/plugins/vxlan/decap.c new file mode 100644 index 00000000000..f5d1efa2b86 --- /dev/null +++ b/src/plugins/vxlan/decap.c @@ -0,0 +1,1330 @@ +/* + * decap.c: vxlan tunnel decap packet processing + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vlib/vlib.h> +#include <vxlan/vxlan.h> +#include <vnet/udp/udp_local.h> + +#ifndef CLIB_MARCH_VARIANT +__clib_export vlib_node_registration_t vxlan4_input_node; +__clib_export vlib_node_registration_t vxlan6_input_node; +#endif + +typedef struct +{ + u32 next_index; + u32 tunnel_index; + u32 error; + u32 vni; +} vxlan_rx_trace_t; + +static u8 * +format_vxlan_rx_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + vxlan_rx_trace_t *t = va_arg (*args, vxlan_rx_trace_t *); + + if (t->tunnel_index == ~0) + return format (s, "VXLAN decap error - tunnel for vni %d does not exist", + t->vni); + return format (s, "VXLAN decap from vxlan_tunnel%d vni %d next %d error %d", + t->tunnel_index, t->vni, t->next_index, t->error); +} + +typedef vxlan4_tunnel_key_t last_tunnel_cache4; + +static const vxlan_decap_info_t decap_not_found = { + .sw_if_index = ~0, + .next_index = VXLAN_INPUT_NEXT_DROP, + .error = VXLAN_ERROR_NO_SUCH_TUNNEL +}; + +static const vxlan_decap_info_t decap_bad_flags = { + .sw_if_index = ~0, + .next_index = VXLAN_INPUT_NEXT_DROP, + .error = VXLAN_ERROR_BAD_FLAGS +}; + +always_inline vxlan_decap_info_t +vxlan4_find_tunnel (vxlan_main_t * vxm, last_tunnel_cache4 * cache, + u32 fib_index, ip4_header_t * ip4_0, + vxlan_header_t * vxlan0, u32 * stats_sw_if_index) +{ + if (PREDICT_FALSE (vxlan0->flags != VXLAN_FLAGS_I)) + return decap_bad_flags; + + /* Make sure VXLAN tunnel exist according to packet S/D IP, UDP port, VRF, + * and VNI */ + u32 dst = ip4_0->dst_address.as_u32; + u32 src = ip4_0->src_address.as_u32; + udp_header_t *udp = ip4_next_header (ip4_0); + vxlan4_tunnel_key_t key4 = { + .key[0] = ((u64) dst << 32) | src, + .key[1] = ((u64) udp->dst_port << 48) | ((u64) fib_index << 32) | + vxlan0->vni_reserved, + }; + + if (PREDICT_TRUE + (key4.key[0] == cache->key[0] && key4.key[1] == cache->key[1])) + { + /* cache hit */ + 
vxlan_decap_info_t di = {.as_u64 = cache->value }; + *stats_sw_if_index = di.sw_if_index; + return di; + } + + int rv = clib_bihash_search_inline_16_8 (&vxm->vxlan4_tunnel_by_key, &key4); + if (PREDICT_TRUE (rv == 0)) + { + *cache = key4; + vxlan_decap_info_t di = {.as_u64 = key4.value }; + *stats_sw_if_index = di.sw_if_index; + return di; + } + + /* try multicast */ + if (PREDICT_TRUE (!ip4_address_is_multicast (&ip4_0->dst_address))) + return decap_not_found; + + /* search for mcast decap info by mcast address */ + key4.key[0] = dst; + rv = clib_bihash_search_inline_16_8 (&vxm->vxlan4_tunnel_by_key, &key4); + if (rv != 0) + return decap_not_found; + + /* search for unicast tunnel using the mcast tunnel local(src) ip */ + vxlan_decap_info_t mdi = {.as_u64 = key4.value }; + key4.key[0] = ((u64) mdi.local_ip.as_u32 << 32) | src; + rv = clib_bihash_search_inline_16_8 (&vxm->vxlan4_tunnel_by_key, &key4); + if (PREDICT_FALSE (rv != 0)) + return decap_not_found; + + /* mcast traffic does not update the cache */ + *stats_sw_if_index = mdi.sw_if_index; + vxlan_decap_info_t di = {.as_u64 = key4.value }; + return di; +} + +typedef vxlan6_tunnel_key_t last_tunnel_cache6; + +always_inline vxlan_decap_info_t +vxlan6_find_tunnel (vxlan_main_t * vxm, last_tunnel_cache6 * cache, + u32 fib_index, ip6_header_t * ip6_0, + vxlan_header_t * vxlan0, u32 * stats_sw_if_index) +{ + if (PREDICT_FALSE (vxlan0->flags != VXLAN_FLAGS_I)) + return decap_bad_flags; + + /* Make sure VXLAN tunnel exist according to packet SIP, UDP port, VRF, and + * VNI */ + udp_header_t *udp = ip6_next_header (ip6_0); + vxlan6_tunnel_key_t key6 = { + .key[0] = ip6_0->src_address.as_u64[0], + .key[1] = ip6_0->src_address.as_u64[1], + .key[2] = ((u64) udp->dst_port << 48) | ((u64) fib_index << 32) | + vxlan0->vni_reserved, + }; + + if (PREDICT_FALSE + (clib_bihash_key_compare_24_8 (key6.key, cache->key) == 0)) + { + int rv = + clib_bihash_search_inline_24_8 (&vxm->vxlan6_tunnel_by_key, &key6); + if (PREDICT_FALSE 
(rv != 0)) + return decap_not_found; + + *cache = key6; + } + vxlan_tunnel_t *t0 = pool_elt_at_index (vxm->tunnels, cache->value); + + /* Validate VXLAN tunnel SIP against packet DIP */ + if (PREDICT_TRUE (ip6_address_is_equal (&ip6_0->dst_address, &t0->src.ip6))) + *stats_sw_if_index = t0->sw_if_index; + else + { + /* try multicast */ + if (PREDICT_TRUE (!ip6_address_is_multicast (&ip6_0->dst_address))) + return decap_not_found; + + /* Make sure mcast VXLAN tunnel exist by packet DIP and VNI */ + key6.key[0] = ip6_0->dst_address.as_u64[0]; + key6.key[1] = ip6_0->dst_address.as_u64[1]; + int rv = + clib_bihash_search_inline_24_8 (&vxm->vxlan6_tunnel_by_key, &key6); + if (PREDICT_FALSE (rv != 0)) + return decap_not_found; + + vxlan_tunnel_t *mcast_t0 = pool_elt_at_index (vxm->tunnels, key6.value); + *stats_sw_if_index = mcast_t0->sw_if_index; + } + + vxlan_decap_info_t di = { + .sw_if_index = t0->sw_if_index, + .next_index = t0->decap_next_index, + }; + return di; +} + +always_inline uword +vxlan_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame, u32 is_ip4) +{ + vxlan_main_t *vxm = &vxlan_main; + vnet_main_t *vnm = vxm->vnet_main; + vnet_interface_main_t *im = &vnm->interface_main; + vlib_combined_counter_main_t *rx_counter = + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_RX; + last_tunnel_cache4 last4; + last_tunnel_cache6 last6; + u32 pkts_dropped = 0; + u32 thread_index = vlib_get_thread_index (); + + if (is_ip4) + clib_memset (&last4, 0xff, sizeof last4); + else + clib_memset (&last6, 0xff, sizeof last6); + + u32 *from = vlib_frame_vector_args (from_frame); + u32 n_left_from = from_frame->n_vectors; + + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs; + vlib_get_buffers (vm, from, bufs, n_left_from); + + u32 stats_if0 = ~0, stats_if1 = ~0; + u16 nexts[VLIB_FRAME_SIZE], *next = nexts; + while (n_left_from >= 4) + { + /* Prefetch next iteration. 
*/ + vlib_prefetch_buffer_header (b[2], LOAD); + vlib_prefetch_buffer_header (b[3], LOAD); + + /* udp leaves current_data pointing at the vxlan header */ + void *cur0 = vlib_buffer_get_current (b[0]); + void *cur1 = vlib_buffer_get_current (b[1]); + vxlan_header_t *vxlan0 = cur0; + vxlan_header_t *vxlan1 = cur1; + + + ip4_header_t *ip4_0, *ip4_1; + ip6_header_t *ip6_0, *ip6_1; + if (is_ip4) + { + ip4_0 = cur0 - sizeof (udp_header_t) - sizeof (ip4_header_t); + ip4_1 = cur1 - sizeof (udp_header_t) - sizeof (ip4_header_t); + } + else + { + ip6_0 = cur0 - sizeof (udp_header_t) - sizeof (ip6_header_t); + ip6_1 = cur1 - sizeof (udp_header_t) - sizeof (ip6_header_t); + } + + /* pop vxlan */ + vlib_buffer_advance (b[0], sizeof *vxlan0); + vlib_buffer_advance (b[1], sizeof *vxlan1); + + u32 fi0 = vlib_buffer_get_ip_fib_index (b[0], is_ip4); + u32 fi1 = vlib_buffer_get_ip_fib_index (b[1], is_ip4); + + vxlan_decap_info_t di0 = is_ip4 ? + vxlan4_find_tunnel (vxm, &last4, fi0, ip4_0, vxlan0, &stats_if0) : + vxlan6_find_tunnel (vxm, &last6, fi0, ip6_0, vxlan0, &stats_if0); + vxlan_decap_info_t di1 = is_ip4 ? + vxlan4_find_tunnel (vxm, &last4, fi1, ip4_1, vxlan1, &stats_if1) : + vxlan6_find_tunnel (vxm, &last6, fi1, ip6_1, vxlan1, &stats_if1); + + /* Prefetch next iteration. 
*/ + clib_prefetch_load (b[2]->data); + clib_prefetch_load (b[3]->data); + + u32 len0 = vlib_buffer_length_in_chain (vm, b[0]); + u32 len1 = vlib_buffer_length_in_chain (vm, b[1]); + + next[0] = di0.next_index; + next[1] = di1.next_index; + + u8 any_error = di0.error | di1.error; + if (PREDICT_TRUE (any_error == 0)) + { + /* Required to make the l2 tag push / pop code work on l2 subifs */ + vnet_update_l2_len (b[0]); + vnet_update_l2_len (b[1]); + /* Set packet input sw_if_index to unicast VXLAN tunnel for learning */ + vnet_buffer (b[0])->sw_if_index[VLIB_RX] = di0.sw_if_index; + vnet_buffer (b[1])->sw_if_index[VLIB_RX] = di1.sw_if_index; + vlib_increment_combined_counter (rx_counter, thread_index, + stats_if0, 1, len0); + vlib_increment_combined_counter (rx_counter, thread_index, + stats_if1, 1, len1); + } + else + { + if (di0.error == 0) + { + vnet_update_l2_len (b[0]); + vnet_buffer (b[0])->sw_if_index[VLIB_RX] = di0.sw_if_index; + vlib_increment_combined_counter (rx_counter, thread_index, + stats_if0, 1, len0); + } + else + { + b[0]->error = node->errors[di0.error]; + pkts_dropped++; + } + + if (di1.error == 0) + { + vnet_update_l2_len (b[1]); + vnet_buffer (b[1])->sw_if_index[VLIB_RX] = di1.sw_if_index; + vlib_increment_combined_counter (rx_counter, thread_index, + stats_if1, 1, len1); + } + else + { + b[1]->error = node->errors[di1.error]; + pkts_dropped++; + } + } + + if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_rx_trace_t *tr = + vlib_add_trace (vm, node, b[0], sizeof (*tr)); + tr->next_index = next[0]; + tr->error = di0.error; + tr->tunnel_index = di0.sw_if_index == ~0 ? + ~0 : vxm->tunnel_index_by_sw_if_index[di0.sw_if_index]; + tr->vni = vnet_get_vni (vxlan0); + } + if (PREDICT_FALSE (b[1]->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_rx_trace_t *tr = + vlib_add_trace (vm, node, b[1], sizeof (*tr)); + tr->next_index = next[1]; + tr->error = di1.error; + tr->tunnel_index = di1.sw_if_index == ~0 ? 
+ ~0 : vxm->tunnel_index_by_sw_if_index[di1.sw_if_index]; + tr->vni = vnet_get_vni (vxlan1); + } + b += 2; + next += 2; + n_left_from -= 2; + } + + while (n_left_from > 0) + { + /* udp leaves current_data pointing at the vxlan header */ + void *cur0 = vlib_buffer_get_current (b[0]); + vxlan_header_t *vxlan0 = cur0; + ip4_header_t *ip4_0; + ip6_header_t *ip6_0; + if (is_ip4) + ip4_0 = cur0 - sizeof (udp_header_t) - sizeof (ip4_header_t); + else + ip6_0 = cur0 - sizeof (udp_header_t) - sizeof (ip6_header_t); + + /* pop (ip, udp, vxlan) */ + vlib_buffer_advance (b[0], sizeof (*vxlan0)); + + u32 fi0 = vlib_buffer_get_ip_fib_index (b[0], is_ip4); + + vxlan_decap_info_t di0 = is_ip4 ? + vxlan4_find_tunnel (vxm, &last4, fi0, ip4_0, vxlan0, &stats_if0) : + vxlan6_find_tunnel (vxm, &last6, fi0, ip6_0, vxlan0, &stats_if0); + + uword len0 = vlib_buffer_length_in_chain (vm, b[0]); + + next[0] = di0.next_index; + + /* Validate VXLAN tunnel encap-fib index against packet */ + if (di0.error == 0) + { + /* Required to make the l2 tag push / pop code work on l2 subifs */ + vnet_update_l2_len (b[0]); + + /* Set packet input sw_if_index to unicast VXLAN tunnel for learning */ + vnet_buffer (b[0])->sw_if_index[VLIB_RX] = di0.sw_if_index; + + vlib_increment_combined_counter (rx_counter, thread_index, + stats_if0, 1, len0); + } + else + { + b[0]->error = node->errors[di0.error]; + pkts_dropped++; + } + + if (PREDICT_FALSE (b[0]->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_rx_trace_t *tr + = vlib_add_trace (vm, node, b[0], sizeof (*tr)); + tr->next_index = next[0]; + tr->error = di0.error; + tr->tunnel_index = di0.sw_if_index == ~0 ? + ~0 : vxm->tunnel_index_by_sw_if_index[di0.sw_if_index]; + tr->vni = vnet_get_vni (vxlan0); + } + b += 1; + next += 1; + n_left_from -= 1; + } + vlib_buffer_enqueue_to_next (vm, node, from, nexts, from_frame->n_vectors); + /* Do we still need this now that tunnel tx stats is kept? */ + u32 node_idx = is_ip4 ? 
vxlan4_input_node.index : vxlan6_input_node.index; + vlib_node_increment_counter (vm, node_idx, VXLAN_ERROR_DECAPSULATED, + from_frame->n_vectors - pkts_dropped); + + return from_frame->n_vectors; +} + +VLIB_NODE_FN (vxlan4_input_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return vxlan_input (vm, node, from_frame, /* is_ip4 */ 1); +} + +VLIB_NODE_FN (vxlan6_input_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + return vxlan_input (vm, node, from_frame, /* is_ip4 */ 0); +} + +static char *vxlan_error_strings[] = { +#define vxlan_error(n,s) s, +#include <vxlan/vxlan_error.def> +#undef vxlan_error +}; + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan4_input_node) = +{ + .name = "vxlan4-input", + .vector_size = sizeof (u32), + .n_errors = VXLAN_N_ERROR, + .error_strings = vxlan_error_strings, + .n_next_nodes = VXLAN_INPUT_N_NEXT, + .format_trace = format_vxlan_rx_trace, + .next_nodes = { +#define _(s,n) [VXLAN_INPUT_NEXT_##s] = n, + foreach_vxlan_input_next +#undef _ + }, +}; + +VLIB_REGISTER_NODE (vxlan6_input_node) = +{ + .name = "vxlan6-input", + .vector_size = sizeof (u32), + .n_errors = VXLAN_N_ERROR, + .error_strings = vxlan_error_strings, + .n_next_nodes = VXLAN_INPUT_N_NEXT, + .next_nodes = { +#define _(s,n) [VXLAN_INPUT_NEXT_##s] = n, + foreach_vxlan_input_next +#undef _ + }, + .format_trace = format_vxlan_rx_trace, +}; +/* *INDENT-ON* */ + +typedef enum +{ + IP_VXLAN_BYPASS_NEXT_DROP, + IP_VXLAN_BYPASS_NEXT_VXLAN, + IP_VXLAN_BYPASS_N_NEXT, +} ip_vxlan_bypass_next_t; + +always_inline uword +ip_vxlan_bypass_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, u32 is_ip4) +{ + vxlan_main_t *vxm = &vxlan_main; + u32 *from, *to_next, n_left_from, n_left_to_next, next_index; + vlib_node_runtime_t *error_node = + vlib_node_get_runtime (vm, ip4_input_node.index); + vtep4_key_t last_vtep4; /* last IPv4 address / fib index + matching a local VTEP address */ + 
vtep6_key_t last_vtep6; /* last IPv6 address / fib index + matching a local VTEP address */ + vlib_buffer_t *bufs[VLIB_FRAME_SIZE], **b = bufs; + + last_tunnel_cache4 last4; + last_tunnel_cache6 last6; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + vlib_get_buffers (vm, from, bufs, n_left_from); + + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip4_forward_next_trace (vm, node, frame, VLIB_TX); + + if (is_ip4) + { + vtep4_key_init (&last_vtep4); + clib_memset (&last4, 0xff, sizeof last4); + } + else + { + vtep6_key_init (&last_vtep6); + clib_memset (&last6, 0xff, sizeof last6); + } + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t *b0, *b1; + ip4_header_t *ip40, *ip41; + ip6_header_t *ip60, *ip61; + udp_header_t *udp0, *udp1; + vxlan_header_t *vxlan0, *vxlan1; + u32 bi0, ip_len0, udp_len0, flags0, next0; + u32 bi1, ip_len1, udp_len1, flags1, next1; + i32 len_diff0, len_diff1; + u8 error0, good_udp0, proto0; + u8 error1, good_udp1, proto1; + u32 stats_if0 = ~0, stats_if1 = ~0; + + /* Prefetch next iteration. 
*/ + { + vlib_prefetch_buffer_header (b[2], LOAD); + vlib_prefetch_buffer_header (b[3], LOAD); + + CLIB_PREFETCH (b[2]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b[3]->data, 2 * CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = to_next[0] = from[0]; + bi1 = to_next[1] = from[1]; + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + b0 = b[0]; + b1 = b[1]; + b += 2; + if (is_ip4) + { + ip40 = vlib_buffer_get_current (b0); + ip41 = vlib_buffer_get_current (b1); + } + else + { + ip60 = vlib_buffer_get_current (b0); + ip61 = vlib_buffer_get_current (b1); + } + + /* Setup packet for next IP feature */ + vnet_feature_next (&next0, b0); + vnet_feature_next (&next1, b1); + + if (is_ip4) + { + /* Treat IP frag packets as "experimental" protocol for now + until support of IP frag reassembly is implemented */ + proto0 = ip4_is_fragment (ip40) ? 0xfe : ip40->protocol; + proto1 = ip4_is_fragment (ip41) ? 0xfe : ip41->protocol; + } + else + { + proto0 = ip60->protocol; + proto1 = ip61->protocol; + } + + /* Process packet 0 */ + if (proto0 != IP_PROTOCOL_UDP) + goto exit0; /* not UDP packet */ + + if (is_ip4) + udp0 = ip4_next_header (ip40); + else + udp0 = ip6_next_header (ip60); + + u32 fi0 = vlib_buffer_get_ip_fib_index (b0, is_ip4); + vxlan0 = vlib_buffer_get_current (b0) + sizeof (udp_header_t) + + sizeof (ip4_header_t); + + vxlan_decap_info_t di0 = + is_ip4 ? 
+ vxlan4_find_tunnel (vxm, &last4, fi0, ip40, vxlan0, &stats_if0) : + vxlan6_find_tunnel (vxm, &last6, fi0, ip60, vxlan0, &stats_if0); + + if (PREDICT_FALSE (di0.sw_if_index == ~0)) + goto exit0; /* unknown interface */ + + /* Validate DIP against VTEPs */ + if (is_ip4) + { +#ifdef CLIB_HAVE_VEC512 + if (!vtep4_check_vector (&vxm->vtep_table, b0, ip40, &last_vtep4, + &vxm->vtep4_u512)) +#else + if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4)) +#endif + goto exit0; /* no local VTEP for VXLAN packet */ + } + else + { + if (!vtep6_check (&vxm->vtep_table, b0, ip60, &last_vtep6)) + goto exit0; /* no local VTEP for VXLAN packet */ + } + + flags0 = b0->flags; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + good_udp0 |= udp0->checksum == 0; + + /* Verify UDP length */ + if (is_ip4) + ip_len0 = clib_net_to_host_u16 (ip40->length); + else + ip_len0 = clib_net_to_host_u16 (ip60->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp0)) + { + if (is_ip4) + flags0 = ip4_tcp_udp_validate_checksum (vm, b0); + else + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + + if (is_ip4) + { + error0 = good_udp0 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP4_ERROR_UDP_LENGTH; + } + else + { + error0 = good_udp0 ? 0 : IP6_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP6_ERROR_UDP_LENGTH; + } + + next0 = error0 ? + IP_VXLAN_BYPASS_NEXT_DROP : IP_VXLAN_BYPASS_NEXT_VXLAN; + b0->error = error0 ? 
error_node->errors[error0] : 0; + + /* vxlan-input node expect current at VXLAN header */ + if (is_ip4) + vlib_buffer_advance (b0, + sizeof (ip4_header_t) + + sizeof (udp_header_t)); + else + vlib_buffer_advance (b0, + sizeof (ip6_header_t) + + sizeof (udp_header_t)); + + exit0: + /* Process packet 1 */ + if (proto1 != IP_PROTOCOL_UDP) + goto exit1; /* not UDP packet */ + + if (is_ip4) + udp1 = ip4_next_header (ip41); + else + udp1 = ip6_next_header (ip61); + + u32 fi1 = vlib_buffer_get_ip_fib_index (b1, is_ip4); + vxlan1 = vlib_buffer_get_current (b1) + sizeof (udp_header_t) + + sizeof (ip4_header_t); + + vxlan_decap_info_t di1 = + is_ip4 ? + vxlan4_find_tunnel (vxm, &last4, fi1, ip41, vxlan1, &stats_if1) : + vxlan6_find_tunnel (vxm, &last6, fi1, ip61, vxlan1, &stats_if1); + + if (PREDICT_FALSE (di1.sw_if_index == ~0)) + goto exit1; /* unknown interface */ + + /* Validate DIP against VTEPs */ + if (is_ip4) + { +#ifdef CLIB_HAVE_VEC512 + if (!vtep4_check_vector (&vxm->vtep_table, b1, ip41, &last_vtep4, + &vxm->vtep4_u512)) +#else + if (!vtep4_check (&vxm->vtep_table, b1, ip41, &last_vtep4)) +#endif + goto exit1; /* no local VTEP for VXLAN packet */ + } + else + { + if (!vtep6_check (&vxm->vtep_table, b1, ip61, &last_vtep6)) + goto exit1; /* no local VTEP for VXLAN packet */ + } + + flags1 = b1->flags; + good_udp1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. 
*/ + good_udp1 |= udp1->checksum == 0; + + /* Verify UDP length */ + if (is_ip4) + ip_len1 = clib_net_to_host_u16 (ip41->length); + else + ip_len1 = clib_net_to_host_u16 (ip61->payload_length); + udp_len1 = clib_net_to_host_u16 (udp1->length); + len_diff1 = ip_len1 - udp_len1; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp1)) + { + if (is_ip4) + flags1 = ip4_tcp_udp_validate_checksum (vm, b1); + else + flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, b1); + good_udp1 = (flags1 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + + if (is_ip4) + { + error1 = good_udp1 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error1 = (len_diff1 >= 0) ? error1 : IP4_ERROR_UDP_LENGTH; + } + else + { + error1 = good_udp1 ? 0 : IP6_ERROR_UDP_CHECKSUM; + error1 = (len_diff1 >= 0) ? error1 : IP6_ERROR_UDP_LENGTH; + } + + next1 = error1 ? + IP_VXLAN_BYPASS_NEXT_DROP : IP_VXLAN_BYPASS_NEXT_VXLAN; + b1->error = error1 ? error_node->errors[error1] : 0; + + /* vxlan-input node expect current at VXLAN header */ + if (is_ip4) + vlib_buffer_advance (b1, + sizeof (ip4_header_t) + + sizeof (udp_header_t)); + else + vlib_buffer_advance (b1, + sizeof (ip6_header_t) + + sizeof (udp_header_t)); + + exit1: + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + ip4_header_t *ip40; + ip6_header_t *ip60; + udp_header_t *udp0; + vxlan_header_t *vxlan0; + u32 bi0, ip_len0, udp_len0, flags0, next0; + i32 len_diff0; + u8 error0, good_udp0, proto0; + u32 stats_if0 = ~0; + + bi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + b0 = b[0]; + b++; + if (is_ip4) + ip40 = vlib_buffer_get_current (b0); + else + ip60 = vlib_buffer_get_current (b0); + + /* Setup packet for next IP feature */ + vnet_feature_next (&next0, b0); + + if (is_ip4) + /* Treat IP4 frag packets as "experimental" protocol for now + until support of IP frag 
reassembly is implemented */ + proto0 = ip4_is_fragment (ip40) ? 0xfe : ip40->protocol; + else + proto0 = ip60->protocol; + + if (proto0 != IP_PROTOCOL_UDP) + goto exit; /* not UDP packet */ + + if (is_ip4) + udp0 = ip4_next_header (ip40); + else + udp0 = ip6_next_header (ip60); + + u32 fi0 = vlib_buffer_get_ip_fib_index (b0, is_ip4); + vxlan0 = vlib_buffer_get_current (b0) + sizeof (udp_header_t) + + sizeof (ip4_header_t); + + vxlan_decap_info_t di0 = + is_ip4 ? + vxlan4_find_tunnel (vxm, &last4, fi0, ip40, vxlan0, &stats_if0) : + vxlan6_find_tunnel (vxm, &last6, fi0, ip60, vxlan0, &stats_if0); + + if (PREDICT_FALSE (di0.sw_if_index == ~0)) + goto exit; /* unknown interface */ + + /* Validate DIP against VTEPs */ + if (is_ip4) + { +#ifdef CLIB_HAVE_VEC512 + if (!vtep4_check_vector (&vxm->vtep_table, b0, ip40, &last_vtep4, + &vxm->vtep4_u512)) +#else + if (!vtep4_check (&vxm->vtep_table, b0, ip40, &last_vtep4)) +#endif + goto exit; /* no local VTEP for VXLAN packet */ + } + else + { + if (!vtep6_check (&vxm->vtep_table, b0, ip60, &last_vtep6)) + goto exit; /* no local VTEP for VXLAN packet */ + } + + flags0 = b0->flags; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + good_udp0 |= udp0->checksum == 0; + + /* Verify UDP length */ + if (is_ip4) + ip_len0 = clib_net_to_host_u16 (ip40->length); + else + ip_len0 = clib_net_to_host_u16 (ip60->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp0)) + { + if (is_ip4) + flags0 = ip4_tcp_udp_validate_checksum (vm, b0); + else + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + + if (is_ip4) + { + error0 = good_udp0 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP4_ERROR_UDP_LENGTH; + } + else + { + error0 = good_udp0 ? 
0 : IP6_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP6_ERROR_UDP_LENGTH; + } + + next0 = error0 ? + IP_VXLAN_BYPASS_NEXT_DROP : IP_VXLAN_BYPASS_NEXT_VXLAN; + b0->error = error0 ? error_node->errors[error0] : 0; + + /* vxlan-input node expect current at VXLAN header */ + if (is_ip4) + vlib_buffer_advance (b0, + sizeof (ip4_header_t) + + sizeof (udp_header_t)); + else + vlib_buffer_advance (b0, + sizeof (ip6_header_t) + + sizeof (udp_header_t)); + + exit: + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_NODE_FN (ip4_vxlan_bypass_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip_vxlan_bypass_inline (vm, node, frame, /* is_ip4 */ 1); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ip4_vxlan_bypass_node) = +{ + .name = "ip4-vxlan-bypass", + .vector_size = sizeof (u32), + .n_next_nodes = IP_VXLAN_BYPASS_N_NEXT, + .next_nodes = { + [IP_VXLAN_BYPASS_NEXT_DROP] = "error-drop", + [IP_VXLAN_BYPASS_NEXT_VXLAN] = "vxlan4-input", + }, + .format_buffer = format_ip4_header, + .format_trace = format_ip4_forward_next_trace, +}; + +/* *INDENT-ON* */ + +/* Dummy init function to get us linked in. 
*/ +static clib_error_t * +ip4_vxlan_bypass_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (ip4_vxlan_bypass_init); + +VLIB_NODE_FN (ip6_vxlan_bypass_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip_vxlan_bypass_inline (vm, node, frame, /* is_ip4 */ 0); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (ip6_vxlan_bypass_node) = +{ + .name = "ip6-vxlan-bypass", + .vector_size = sizeof (u32), + .n_next_nodes = IP_VXLAN_BYPASS_N_NEXT, + .next_nodes = { + [IP_VXLAN_BYPASS_NEXT_DROP] = "error-drop", + [IP_VXLAN_BYPASS_NEXT_VXLAN] = "vxlan6-input", + }, + .format_buffer = format_ip6_header, + .format_trace = format_ip6_forward_next_trace, +}; + +/* *INDENT-ON* */ + +/* Dummy init function to get us linked in. */ +static clib_error_t * +ip6_vxlan_bypass_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (ip6_vxlan_bypass_init); + +#define foreach_vxlan_flow_input_next \ +_(DROP, "error-drop") \ +_(L2_INPUT, "l2-input") + +typedef enum +{ +#define _(s,n) VXLAN_FLOW_NEXT_##s, + foreach_vxlan_flow_input_next +#undef _ + VXLAN_FLOW_N_NEXT, +} vxlan_flow_input_next_t; + +#define foreach_vxlan_flow_error \ + _(NONE, "no error") \ + _(IP_CHECKSUM_ERROR, "Rx ip checksum errors") \ + _(IP_HEADER_ERROR, "Rx ip header errors") \ + _(UDP_CHECKSUM_ERROR, "Rx udp checksum errors") \ + _(UDP_LENGTH_ERROR, "Rx udp length errors") + +typedef enum +{ +#define _(f,s) VXLAN_FLOW_ERROR_##f, + foreach_vxlan_flow_error +#undef _ + VXLAN_FLOW_N_ERROR, +} vxlan_flow_error_t; + +static char *vxlan_flow_error_strings[] = { +#define _(n,s) s, + foreach_vxlan_flow_error +#undef _ +}; + + +static_always_inline u8 +vxlan_validate_udp_csum (vlib_main_t * vm, vlib_buffer_t * b) +{ + u32 flags = b->flags; + enum + { offset = + sizeof (ip4_header_t) + sizeof (udp_header_t) + sizeof (vxlan_header_t), + }; + + /* Verify UDP checksum */ + if ((flags & VNET_BUFFER_F_L4_CHECKSUM_COMPUTED) == 0) + { + vlib_buffer_advance (b, -offset); + flags 
= ip4_tcp_udp_validate_checksum (vm, b); + vlib_buffer_advance (b, offset); + } + + return (flags & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; +} + +static_always_inline u8 +vxlan_check_udp_csum (vlib_main_t * vm, vlib_buffer_t * b) +{ + ip4_vxlan_header_t *hdr = vlib_buffer_get_current (b) - sizeof *hdr; + udp_header_t *udp = &hdr->udp; + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + u8 good_csum = (b->flags & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0 || + udp->checksum == 0; + + return !good_csum; +} + +static_always_inline u8 +vxlan_check_ip (vlib_buffer_t * b, u16 payload_len) +{ + ip4_vxlan_header_t *hdr = vlib_buffer_get_current (b) - sizeof *hdr; + u16 ip_len = clib_net_to_host_u16 (hdr->ip4.length); + u16 expected = payload_len + sizeof *hdr; + return ip_len > expected || hdr->ip4.ttl == 0 + || hdr->ip4.ip_version_and_header_length != 0x45; +} + +static_always_inline u8 +vxlan_check_ip_udp_len (vlib_buffer_t * b) +{ + ip4_vxlan_header_t *hdr = vlib_buffer_get_current (b) - sizeof *hdr; + u16 ip_len = clib_net_to_host_u16 (hdr->ip4.length); + u16 udp_len = clib_net_to_host_u16 (hdr->udp.length); + return udp_len > ip_len; +} + +static_always_inline u8 +vxlan_err_code (u8 ip_err0, u8 udp_err0, u8 csum_err0) +{ + u8 error0 = VXLAN_FLOW_ERROR_NONE; + if (ip_err0) + error0 = VXLAN_FLOW_ERROR_IP_HEADER_ERROR; + if (udp_err0) + error0 = VXLAN_FLOW_ERROR_UDP_LENGTH_ERROR; + if (csum_err0) + error0 = VXLAN_FLOW_ERROR_UDP_CHECKSUM_ERROR; + return error0; +} + +VLIB_NODE_FN (vxlan4_flow_input_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * f) +{ + enum + { payload_offset = sizeof (ip4_vxlan_header_t) }; + + vxlan_main_t *vxm = &vxlan_main; + vnet_interface_main_t *im = &vnet_main.interface_main; + vlib_combined_counter_main_t *rx_counter[VXLAN_FLOW_N_NEXT] = { + [VXLAN_FLOW_NEXT_DROP] = + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_DROP, + [VXLAN_FLOW_NEXT_L2_INPUT] = + im->combined_sw_if_counters + 
VNET_INTERFACE_COUNTER_RX, + }; + u32 thread_index = vlib_get_thread_index (); + + u32 *from = vlib_frame_vector_args (f); + u32 n_left_from = f->n_vectors; + u32 next_index = VXLAN_FLOW_NEXT_L2_INPUT; + + while (n_left_from > 0) + { + u32 n_left_to_next, *to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 3 && n_left_to_next > 3) + { + u32 bi0 = to_next[0] = from[0]; + u32 bi1 = to_next[1] = from[1]; + u32 bi2 = to_next[2] = from[2]; + u32 bi3 = to_next[3] = from[3]; + from += 4; + n_left_from -= 4; + to_next += 4; + n_left_to_next -= 4; + + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + vlib_buffer_t *b1 = vlib_get_buffer (vm, bi1); + vlib_buffer_t *b2 = vlib_get_buffer (vm, bi2); + vlib_buffer_t *b3 = vlib_get_buffer (vm, bi3); + + vlib_buffer_advance (b0, payload_offset); + vlib_buffer_advance (b1, payload_offset); + vlib_buffer_advance (b2, payload_offset); + vlib_buffer_advance (b3, payload_offset); + + u16 len0 = vlib_buffer_length_in_chain (vm, b0); + u16 len1 = vlib_buffer_length_in_chain (vm, b1); + u16 len2 = vlib_buffer_length_in_chain (vm, b2); + u16 len3 = vlib_buffer_length_in_chain (vm, b3); + + u32 next0 = VXLAN_FLOW_NEXT_L2_INPUT, next1 = + VXLAN_FLOW_NEXT_L2_INPUT, next2 = + VXLAN_FLOW_NEXT_L2_INPUT, next3 = VXLAN_FLOW_NEXT_L2_INPUT; + + u8 ip_err0 = vxlan_check_ip (b0, len0); + u8 ip_err1 = vxlan_check_ip (b1, len1); + u8 ip_err2 = vxlan_check_ip (b2, len2); + u8 ip_err3 = vxlan_check_ip (b3, len3); + u8 ip_err = ip_err0 | ip_err1 | ip_err2 | ip_err3; + + u8 udp_err0 = vxlan_check_ip_udp_len (b0); + u8 udp_err1 = vxlan_check_ip_udp_len (b1); + u8 udp_err2 = vxlan_check_ip_udp_len (b2); + u8 udp_err3 = vxlan_check_ip_udp_len (b3); + u8 udp_err = udp_err0 | udp_err1 | udp_err2 | udp_err3; + + u8 csum_err0 = vxlan_check_udp_csum (vm, b0); + u8 csum_err1 = vxlan_check_udp_csum (vm, b1); + u8 csum_err2 = vxlan_check_udp_csum (vm, b2); + u8 csum_err3 = vxlan_check_udp_csum (vm, b3); + u8 
csum_err = csum_err0 | csum_err1 | csum_err2 | csum_err3; + + if (PREDICT_FALSE (csum_err)) + { + if (csum_err0) + csum_err0 = !vxlan_validate_udp_csum (vm, b0); + if (csum_err1) + csum_err1 = !vxlan_validate_udp_csum (vm, b1); + if (csum_err2) + csum_err2 = !vxlan_validate_udp_csum (vm, b2); + if (csum_err3) + csum_err3 = !vxlan_validate_udp_csum (vm, b3); + csum_err = csum_err0 | csum_err1 | csum_err2 | csum_err3; + } + + if (PREDICT_FALSE (ip_err || udp_err || csum_err)) + { + if (ip_err0 || udp_err0 || csum_err0) + { + next0 = VXLAN_FLOW_NEXT_DROP; + u8 error0 = vxlan_err_code (ip_err0, udp_err0, csum_err0); + b0->error = node->errors[error0]; + } + if (ip_err1 || udp_err1 || csum_err1) + { + next1 = VXLAN_FLOW_NEXT_DROP; + u8 error1 = vxlan_err_code (ip_err1, udp_err1, csum_err1); + b1->error = node->errors[error1]; + } + if (ip_err2 || udp_err2 || csum_err2) + { + next2 = VXLAN_FLOW_NEXT_DROP; + u8 error2 = vxlan_err_code (ip_err2, udp_err2, csum_err2); + b2->error = node->errors[error2]; + } + if (ip_err3 || udp_err3 || csum_err3) + { + next3 = VXLAN_FLOW_NEXT_DROP; + u8 error3 = vxlan_err_code (ip_err3, udp_err3, csum_err3); + b3->error = node->errors[error3]; + } + } + + vnet_update_l2_len (b0); + vnet_update_l2_len (b1); + vnet_update_l2_len (b2); + vnet_update_l2_len (b3); + + ASSERT (b0->flow_id != 0); + ASSERT (b1->flow_id != 0); + ASSERT (b2->flow_id != 0); + ASSERT (b3->flow_id != 0); + + u32 t_index0 = b0->flow_id - vxm->flow_id_start; + u32 t_index1 = b1->flow_id - vxm->flow_id_start; + u32 t_index2 = b2->flow_id - vxm->flow_id_start; + u32 t_index3 = b3->flow_id - vxm->flow_id_start; + + vxlan_tunnel_t *t0 = &vxm->tunnels[t_index0]; + vxlan_tunnel_t *t1 = &vxm->tunnels[t_index1]; + vxlan_tunnel_t *t2 = &vxm->tunnels[t_index2]; + vxlan_tunnel_t *t3 = &vxm->tunnels[t_index3]; + + /* flow id consumed */ + b0->flow_id = 0; + b1->flow_id = 0; + b2->flow_id = 0; + b3->flow_id = 0; + + u32 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX] = + 
t0->sw_if_index; + u32 sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX] = + t1->sw_if_index; + u32 sw_if_index2 = vnet_buffer (b2)->sw_if_index[VLIB_RX] = + t2->sw_if_index; + u32 sw_if_index3 = vnet_buffer (b3)->sw_if_index[VLIB_RX] = + t3->sw_if_index; + + vlib_increment_combined_counter (rx_counter[next0], thread_index, + sw_if_index0, 1, len0); + vlib_increment_combined_counter (rx_counter[next1], thread_index, + sw_if_index1, 1, len1); + vlib_increment_combined_counter (rx_counter[next2], thread_index, + sw_if_index2, 1, len2); + vlib_increment_combined_counter (rx_counter[next3], thread_index, + sw_if_index3, 1, len3); + + u32 flags = b0->flags | b1->flags | b2->flags | b3->flags; + + if (PREDICT_FALSE (flags & VLIB_BUFFER_IS_TRACED)) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + vxlan_rx_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof *tr); + u8 error0 = vxlan_err_code (ip_err0, udp_err0, csum_err0); + tr->next_index = next0; + tr->error = error0; + tr->tunnel_index = t_index0; + tr->vni = t0->vni; + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + vxlan_rx_trace_t *tr = + vlib_add_trace (vm, node, b1, sizeof *tr); + u8 error1 = vxlan_err_code (ip_err1, udp_err1, csum_err1); + tr->next_index = next1; + tr->error = error1; + tr->tunnel_index = t_index1; + tr->vni = t1->vni; + } + if (b2->flags & VLIB_BUFFER_IS_TRACED) + { + vxlan_rx_trace_t *tr = + vlib_add_trace (vm, node, b2, sizeof *tr); + u8 error2 = vxlan_err_code (ip_err2, udp_err2, csum_err2); + tr->next_index = next2; + tr->error = error2; + tr->tunnel_index = t_index2; + tr->vni = t2->vni; + } + if (b3->flags & VLIB_BUFFER_IS_TRACED) + { + vxlan_rx_trace_t *tr = + vlib_add_trace (vm, node, b3, sizeof *tr); + u8 error3 = vxlan_err_code (ip_err3, udp_err3, csum_err3); + tr->next_index = next3; + tr->error = error3; + tr->tunnel_index = t_index3; + tr->vni = t3->vni; + } + } + vlib_validate_buffer_enqueue_x4 + (vm, node, next_index, to_next, n_left_to_next, + bi0, bi1, bi2, bi3, next0, 
next1, next2, next3); + } + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0 = to_next[0] = from[0]; + from++; + n_left_from--; + to_next++; + n_left_to_next--; + + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + vlib_buffer_advance (b0, payload_offset); + + u16 len0 = vlib_buffer_length_in_chain (vm, b0); + u32 next0 = VXLAN_FLOW_NEXT_L2_INPUT; + + u8 ip_err0 = vxlan_check_ip (b0, len0); + u8 udp_err0 = vxlan_check_ip_udp_len (b0); + u8 csum_err0 = vxlan_check_udp_csum (vm, b0); + + if (csum_err0) + csum_err0 = !vxlan_validate_udp_csum (vm, b0); + if (ip_err0 || udp_err0 || csum_err0) + { + next0 = VXLAN_FLOW_NEXT_DROP; + u8 error0 = vxlan_err_code (ip_err0, udp_err0, csum_err0); + b0->error = node->errors[error0]; + } + + vnet_update_l2_len (b0); + + ASSERT (b0->flow_id != 0); + u32 t_index0 = b0->flow_id - vxm->flow_id_start; + vxlan_tunnel_t *t0 = &vxm->tunnels[t_index0]; + b0->flow_id = 0; + + u32 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX] = + t0->sw_if_index; + vlib_increment_combined_counter (rx_counter[next0], thread_index, + sw_if_index0, 1, len0); + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_rx_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof *tr); + u8 error0 = vxlan_err_code (ip_err0, udp_err0, csum_err0); + tr->next_index = next0; + tr->error = error0; + tr->tunnel_index = t_index0; + tr->vni = t0->vni; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return f->n_vectors; +} + +/* *INDENT-OFF* */ +#ifndef CLIB_MULTIARCH_VARIANT +VLIB_REGISTER_NODE (vxlan4_flow_input_node) = { + .name = "vxlan-flow-input", + .type = VLIB_NODE_TYPE_INTERNAL, + .vector_size = sizeof (u32), + + .format_trace = format_vxlan_rx_trace, + + .n_errors = VXLAN_FLOW_N_ERROR, + .error_strings = vxlan_flow_error_strings, + + .n_next_nodes = VXLAN_FLOW_N_NEXT, + .next_nodes = { +#define _(s,n) 
[VXLAN_FLOW_NEXT_##s] = n, + foreach_vxlan_flow_input_next +#undef _ + }, +}; +#endif +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/vxlan/dir.dox b/src/plugins/vxlan/dir.dox new file mode 100644 index 00000000000..31a9e2b6112 --- /dev/null +++ b/src/plugins/vxlan/dir.dox @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2016 Cisco and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** +@dir +@brief VXLAN Code. + +This directory contains source code to support VXLAN. + +*/ +/*? %%clicmd:group_label VXLAN CLI %% ?*/ diff --git a/src/plugins/vxlan/encap.c b/src/plugins/vxlan/encap.c new file mode 100644 index 00000000000..e4103ec0083 --- /dev/null +++ b/src/plugins/vxlan/encap.c @@ -0,0 +1,540 @@ + +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/interface_output.h> +#include <vxlan/vxlan.h> +#include <vnet/qos/qos_types.h> +#include <vnet/adj/rewrite.h> + +/* Statistics (not all errors) */ +#define foreach_vxlan_encap_error \ +_(ENCAPSULATED, "good packets encapsulated") + +static char *vxlan_encap_error_strings[] = { +#define _(sym,string) string, + foreach_vxlan_encap_error +#undef _ +}; + +typedef enum +{ +#define _(sym,str) VXLAN_ENCAP_ERROR_##sym, + foreach_vxlan_encap_error +#undef _ + VXLAN_ENCAP_N_ERROR, +} vxlan_encap_error_t; + +typedef enum +{ + VXLAN_ENCAP_NEXT_DROP, + VXLAN_ENCAP_N_NEXT, +} vxlan_encap_next_t; + +typedef struct +{ + u32 tunnel_index; + u32 vni; +} vxlan_encap_trace_t; + +#ifndef CLIB_MARCH_VARIANT +u8 * +format_vxlan_encap_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + vxlan_encap_trace_t *t = va_arg (*args, vxlan_encap_trace_t *); + + s = format (s, "VXLAN encap to vxlan_tunnel%d vni %d", + t->tunnel_index, t->vni); + return s; +} +#endif + +always_inline uword +vxlan_encap_inline (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *from_frame, u8 is_ip4) +{ + u32 n_left_from, next_index, *from, *to_next; + vxlan_main_t *vxm = &vxlan_main; + vnet_main_t *vnm = vxm->vnet_main; + vnet_interface_main_t *im = &vnm->interface_main; + vlib_combined_counter_main_t *tx_counter = + im->combined_sw_if_counters + VNET_INTERFACE_COUNTER_TX; + u32 pkts_encapsulated = 0; + u32 thread_index = vlib_get_thread_index (); + u32 sw_if_index0 = 0, sw_if_index1 = 0; + u32 next0 = 0, next1 = 0; + vxlan_tunnel_t *t0 = NULL, *t1 = NULL; + index_t dpoi_idx0 = INDEX_INVALID, dpoi_idx1 = INDEX_INVALID; + vlib_buffer_t *bufs[VLIB_FRAME_SIZE]; + vlib_buffer_t **b = bufs; + + from = vlib_frame_vector_args 
(from_frame); + n_left_from = from_frame->n_vectors; + + next_index = node->cached_next_index; + + STATIC_ASSERT_SIZEOF (ip6_vxlan_header_t, 56); + STATIC_ASSERT_SIZEOF (ip4_vxlan_header_t, 36); + + u8 const underlay_hdr_len = is_ip4 ? + sizeof (ip4_vxlan_header_t) : sizeof (ip6_vxlan_header_t); + u16 const l3_len = is_ip4 ? sizeof (ip4_header_t) : sizeof (ip6_header_t); + u32 const outer_packet_csum_offload_flags = + is_ip4 ? (VNET_BUFFER_OFFLOAD_F_OUTER_IP_CKSUM | + VNET_BUFFER_OFFLOAD_F_TNL_VXLAN) : + (VNET_BUFFER_OFFLOAD_F_OUTER_UDP_CKSUM | + VNET_BUFFER_OFFLOAD_F_TNL_VXLAN); + + vlib_get_buffers (vm, from, bufs, n_left_from); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + /* Prefetch next iteration. */ + { + vlib_prefetch_buffer_header (b[2], LOAD); + vlib_prefetch_buffer_header (b[3], LOAD); + + CLIB_PREFETCH (b[2]->data - CLIB_CACHE_LINE_BYTES, + 2 * CLIB_CACHE_LINE_BYTES, LOAD); + CLIB_PREFETCH (b[3]->data - CLIB_CACHE_LINE_BYTES, + 2 * CLIB_CACHE_LINE_BYTES, LOAD); + } + + u32 bi0 = to_next[0] = from[0]; + u32 bi1 = to_next[1] = from[1]; + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + vlib_buffer_t *b0 = b[0]; + vlib_buffer_t *b1 = b[1]; + b += 2; + + u32 flow_hash0 = vnet_l2_compute_flow_hash (b0); + u32 flow_hash1 = vnet_l2_compute_flow_hash (b1); + + /* Get next node index and adj index from tunnel next_dpo */ + if (sw_if_index0 != vnet_buffer (b0)->sw_if_index[VLIB_TX]) + { + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + vnet_hw_interface_t *hi0 = + vnet_get_sup_hw_interface (vnm, sw_if_index0); + t0 = &vxm->tunnels[hi0->dev_instance]; + /* Note: change to always set next0 if it may set to drop */ + next0 = t0->next_dpo.dpoi_next_node; + dpoi_idx0 = t0->next_dpo.dpoi_index; + } + + /* Get next node index and adj index from tunnel next_dpo */ + if (sw_if_index1 != vnet_buffer 
(b1)->sw_if_index[VLIB_TX]) + { + if (sw_if_index0 == vnet_buffer (b1)->sw_if_index[VLIB_TX]) + { + sw_if_index1 = sw_if_index0; + t1 = t0; + next1 = next0; + dpoi_idx1 = dpoi_idx0; + } + else + { + sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_TX]; + vnet_hw_interface_t *hi1 = + vnet_get_sup_hw_interface (vnm, sw_if_index1); + t1 = &vxm->tunnels[hi1->dev_instance]; + /* Note: change to always set next1 if it may set to drop */ + next1 = t1->next_dpo.dpoi_next_node; + dpoi_idx1 = t1->next_dpo.dpoi_index; + } + } + + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpoi_idx0; + vnet_buffer (b1)->ip.adj_index[VLIB_TX] = dpoi_idx1; + + ASSERT (t0->rewrite_header.data_bytes == underlay_hdr_len); + ASSERT (t1->rewrite_header.data_bytes == underlay_hdr_len); + vnet_rewrite_two_headers (*t0, *t1, vlib_buffer_get_current (b0), + vlib_buffer_get_current (b1), + underlay_hdr_len); + + vlib_buffer_advance (b0, -underlay_hdr_len); + vlib_buffer_advance (b1, -underlay_hdr_len); + + u32 len0 = vlib_buffer_length_in_chain (vm, b0); + u32 len1 = vlib_buffer_length_in_chain (vm, b1); + u16 payload_l0 = clib_host_to_net_u16 (len0 - l3_len); + u16 payload_l1 = clib_host_to_net_u16 (len1 - l3_len); + + void *underlay0 = vlib_buffer_get_current (b0); + void *underlay1 = vlib_buffer_get_current (b1); + + ip4_header_t *ip4_0, *ip4_1; + qos_bits_t ip4_0_tos = 0, ip4_1_tos = 0; + ip6_header_t *ip6_0, *ip6_1; + udp_header_t *udp0, *udp1; + u8 *l3_0, *l3_1; + if (is_ip4) + { + ip4_vxlan_header_t *hdr0 = underlay0; + ip4_vxlan_header_t *hdr1 = underlay1; + + /* Fix the IP4 checksum and length */ + ip4_0 = &hdr0->ip4; + ip4_1 = &hdr1->ip4; + ip4_0->length = clib_host_to_net_u16 (len0); + ip4_1->length = clib_host_to_net_u16 (len1); + + if (PREDICT_FALSE (b0->flags & VNET_BUFFER_F_QOS_DATA_VALID)) + { + ip4_0_tos = vnet_buffer2 (b0)->qos.bits; + ip4_0->tos = ip4_0_tos; + } + if (PREDICT_FALSE (b1->flags & VNET_BUFFER_F_QOS_DATA_VALID)) + { + ip4_1_tos = vnet_buffer2 (b1)->qos.bits; + ip4_1->tos = 
ip4_1_tos; + } + + l3_0 = (u8 *) ip4_0; + l3_1 = (u8 *) ip4_1; + udp0 = &hdr0->udp; + udp1 = &hdr1->udp; + } + else /* ipv6 */ + { + ip6_vxlan_header_t *hdr0 = underlay0; + ip6_vxlan_header_t *hdr1 = underlay1; + + /* Fix IP6 payload length */ + ip6_0 = &hdr0->ip6; + ip6_1 = &hdr1->ip6; + ip6_0->payload_length = payload_l0; + ip6_1->payload_length = payload_l1; + + l3_0 = (u8 *) ip6_0; + l3_1 = (u8 *) ip6_1; + udp0 = &hdr0->udp; + udp1 = &hdr1->udp; + } + + /* Fix UDP length and set source port */ + udp0->length = payload_l0; + udp0->src_port = flow_hash0; + udp1->length = payload_l1; + udp1->src_port = flow_hash1; + + if (b0->flags & VNET_BUFFER_F_OFFLOAD) + { + vnet_buffer2 (b0)->outer_l3_hdr_offset = l3_0 - b0->data; + vnet_buffer2 (b0)->outer_l4_hdr_offset = (u8 *) udp0 - b0->data; + vnet_buffer_offload_flags_set (b0, + outer_packet_csum_offload_flags); + } + /* IPv4 checksum only */ + else if (is_ip4) + { + ip_csum_t sum0 = ip4_0->checksum; + sum0 = ip_csum_update (sum0, 0, ip4_0->length, ip4_header_t, + length /* changed member */); + if (PREDICT_FALSE (ip4_0_tos)) + { + sum0 = ip_csum_update (sum0, 0, ip4_0_tos, ip4_header_t, + tos /* changed member */); + } + ip4_0->checksum = ip_csum_fold (sum0); + } + /* IPv6 UDP checksum is mandatory */ + else + { + int bogus = 0; + + udp0->checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip6_0, &bogus); + ASSERT (bogus == 0); + if (udp0->checksum == 0) + udp0->checksum = 0xffff; + } + + if (b1->flags & VNET_BUFFER_F_OFFLOAD) + { + vnet_buffer2 (b1)->outer_l3_hdr_offset = l3_1 - b1->data; + vnet_buffer2 (b1)->outer_l4_hdr_offset = (u8 *) udp1 - b1->data; + vnet_buffer_offload_flags_set (b1, + outer_packet_csum_offload_flags); + } + /* IPv4 checksum only */ + else if (is_ip4) + { + ip_csum_t sum1 = ip4_1->checksum; + sum1 = ip_csum_update (sum1, 0, ip4_1->length, ip4_header_t, + length /* changed member */); + if (PREDICT_FALSE (ip4_1_tos)) + { + sum1 = ip_csum_update (sum1, 0, ip4_1_tos, ip4_header_t, + tos /* 
changed member */); + } + ip4_1->checksum = ip_csum_fold (sum1); + } + /* IPv6 UDP checksum is mandatory */ + else + { + int bogus = 0; + + udp1->checksum = ip6_tcp_udp_icmp_compute_checksum + (vm, b1, ip6_1, &bogus); + ASSERT (bogus == 0); + if (udp1->checksum == 0) + udp1->checksum = 0xffff; + } + + /* save inner packet flow_hash for load-balance node */ + vnet_buffer (b0)->ip.flow_hash = flow_hash0; + vnet_buffer (b1)->ip.flow_hash = flow_hash1; + + if (sw_if_index0 == sw_if_index1) + { + vlib_increment_combined_counter (tx_counter, thread_index, + sw_if_index0, 2, len0 + len1); + } + else + { + vlib_increment_combined_counter (tx_counter, thread_index, + sw_if_index0, 1, len0); + vlib_increment_combined_counter (tx_counter, thread_index, + sw_if_index1, 1, len1); + } + pkts_encapsulated += 2; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_encap_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->tunnel_index = t0 - vxm->tunnels; + tr->vni = t0->vni; + } + + if (PREDICT_FALSE (b1->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_encap_trace_t *tr = + vlib_add_trace (vm, node, b1, sizeof (*tr)); + tr->tunnel_index = t1 - vxm->tunnels; + tr->vni = t1->vni; + } + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0 = to_next[0] = from[0]; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + vlib_buffer_t *b0 = b[0]; + b += 1; + + u32 flow_hash0 = vnet_l2_compute_flow_hash (b0); + + /* Get next node index and adj index from tunnel next_dpo */ + if (sw_if_index0 != vnet_buffer (b0)->sw_if_index[VLIB_TX]) + { + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + vnet_hw_interface_t *hi0 = + vnet_get_sup_hw_interface (vnm, sw_if_index0); + t0 = &vxm->tunnels[hi0->dev_instance]; + /* Note: change to always set next0 if it may be set to drop */ + next0 = t0->next_dpo.dpoi_next_node; + 
dpoi_idx0 = t0->next_dpo.dpoi_index; + } + vnet_buffer (b0)->ip.adj_index[VLIB_TX] = dpoi_idx0; + + ASSERT (t0->rewrite_header.data_bytes == underlay_hdr_len); + vnet_rewrite_one_header (*t0, vlib_buffer_get_current (b0), + underlay_hdr_len); + + vlib_buffer_advance (b0, -underlay_hdr_len); + void *underlay0 = vlib_buffer_get_current (b0); + + u32 len0 = vlib_buffer_length_in_chain (vm, b0); + u16 payload_l0 = clib_host_to_net_u16 (len0 - l3_len); + + udp_header_t *udp0; + ip4_header_t *ip4_0; + qos_bits_t ip4_0_tos = 0; + ip6_header_t *ip6_0; + u8 *l3_0; + if (is_ip4) + { + ip4_vxlan_header_t *hdr = underlay0; + + /* Fix the IP4 checksum and length */ + ip4_0 = &hdr->ip4; + ip4_0->length = clib_host_to_net_u16 (len0); + + if (PREDICT_FALSE (b0->flags & VNET_BUFFER_F_QOS_DATA_VALID)) + { + ip4_0_tos = vnet_buffer2 (b0)->qos.bits; + ip4_0->tos = ip4_0_tos; + } + + l3_0 = (u8 *) ip4_0; + udp0 = &hdr->udp; + } + else /* ip6 path */ + { + ip6_vxlan_header_t *hdr = underlay0; + + /* Fix IP6 payload length */ + ip6_0 = &hdr->ip6; + ip6_0->payload_length = payload_l0; + + l3_0 = (u8 *) ip6_0; + udp0 = &hdr->udp; + } + + /* Fix UDP length and set source port */ + udp0->length = payload_l0; + udp0->src_port = flow_hash0; + + if (b0->flags & VNET_BUFFER_F_OFFLOAD) + { + vnet_buffer2 (b0)->outer_l3_hdr_offset = l3_0 - b0->data; + vnet_buffer2 (b0)->outer_l4_hdr_offset = (u8 *) udp0 - b0->data; + vnet_buffer_offload_flags_set (b0, + outer_packet_csum_offload_flags); + } + /* IPv4 checksum only */ + else if (is_ip4) + { + ip_csum_t sum0 = ip4_0->checksum; + sum0 = ip_csum_update (sum0, 0, ip4_0->length, ip4_header_t, + length /* changed member */); + if (PREDICT_FALSE (ip4_0_tos)) + { + sum0 = ip_csum_update (sum0, 0, ip4_0_tos, ip4_header_t, + tos /* changed member */); + } + ip4_0->checksum = ip_csum_fold (sum0); + } + /* IPv6 UDP checksum is mandatory */ + else + { + int bogus = 0; + + udp0->checksum = ip6_tcp_udp_icmp_compute_checksum + (vm, b0, ip6_0, &bogus); + ASSERT 
(bogus == 0); + if (udp0->checksum == 0) + udp0->checksum = 0xffff; + } + + /* reuse inner packet flow_hash for load-balance node */ + vnet_buffer (b0)->ip.flow_hash = flow_hash0; + + vlib_increment_combined_counter (tx_counter, thread_index, + sw_if_index0, 1, len0); + pkts_encapsulated++; + + if (PREDICT_FALSE (b0->flags & VLIB_BUFFER_IS_TRACED)) + { + vxlan_encap_trace_t *tr = + vlib_add_trace (vm, node, b0, sizeof (*tr)); + tr->tunnel_index = t0 - vxm->tunnels; + tr->vni = t0->vni; + } + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Do we still need this now that tunnel tx stats is kept? */ + vlib_node_increment_counter (vm, node->node_index, + VXLAN_ENCAP_ERROR_ENCAPSULATED, + pkts_encapsulated); + + return from_frame->n_vectors; +} + +VLIB_NODE_FN (vxlan4_encap_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + /* Disable chksum offload as setup overhead in tx node is not worthwhile + for ip4 header checksum only, unless udp checksum is also required */ + return vxlan_encap_inline (vm, node, from_frame, /* is_ip4 */ 1); +} + +VLIB_NODE_FN (vxlan6_encap_node) (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * from_frame) +{ + /* Enable checksum offload for ip6 as udp checksum is mandatory, */ + return vxlan_encap_inline (vm, node, from_frame, /* is_ip4 */ 0); +} + +/* *INDENT-OFF* */ +VLIB_REGISTER_NODE (vxlan4_encap_node) = { + .name = "vxlan4-encap", + .vector_size = sizeof (u32), + .format_trace = format_vxlan_encap_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(vxlan_encap_error_strings), + .error_strings = vxlan_encap_error_strings, + .n_next_nodes = VXLAN_ENCAP_N_NEXT, + .next_nodes = { + [VXLAN_ENCAP_NEXT_DROP] = "error-drop", + }, +}; + +VLIB_REGISTER_NODE (vxlan6_encap_node) = { + .name = "vxlan6-encap", + .vector_size = sizeof (u32), + .format_trace = 
format_vxlan_encap_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(vxlan_encap_error_strings), + .error_strings = vxlan_encap_error_strings, + .n_next_nodes = VXLAN_ENCAP_N_NEXT, + .next_nodes = { + [VXLAN_ENCAP_NEXT_DROP] = "error-drop", + }, +}; +/* *INDENT-ON* */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/vxlan/plugin.c b/src/plugins/vxlan/plugin.c new file mode 100644 index 00000000000..aaeafb03a3b --- /dev/null +++ b/src/plugins/vxlan/plugin.c @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2022 Cisco Systems, Inc. + * License: Cisco Proprietary Closed Source License - Cisco Internal. + * The software, documentation and any fonts accompanying this License whether + * on disk, in read only memory, on any other media or in any other form (col- + * lectively the “Software”) are licensed, not sold, to you by Cisco, Inc. + * (“Cisco”) for use only under the terms of this License, and Cisco reserves + * all rights not expressly granted to you. The rights granted herein are + * limited to Cisco’s intel- lectual property rights in the Cisco Software and + * do not include any other patents or intellectual property rights. You own + * the media on which the Cisco Software is recorded but Cisco and/or Cisco’s + * licensor(s) retain ownership of the Software itself. + */ + +#include <vlib/vlib.h> +#include <vnet/plugin/plugin.h> +#include <vpp/app/version.h> + +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "VxLan Tunnels", +}; diff --git a/src/plugins/vxlan/vxlan.api b/src/plugins/vxlan/vxlan.api new file mode 100644 index 00000000000..b7e678595d8 --- /dev/null +++ b/src/plugins/vxlan/vxlan.api @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2015-2016 Cisco and/or its affiliates. 
+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +option version = "2.1.0"; + +import "vnet/interface_types.api"; +import "vnet/ip/ip_types.api"; + +/** \brief Create or delete a VXLAN tunnel + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - Use 1 to create the tunnel, 0 to remove it + @param instance - optional unique custom device instance, else ~0. + @param src_address - Source IP address + @param dst_address - Destination IP address, can be multicast + @param mcast_sw_if_index - Interface for multicast destination + @param encap_vrf_id - Encap route table FIB index + @param decap_next_index - index of decap next graph node + @param vni - The VXLAN Network Identifier, uint24 +*/ +define vxlan_add_del_tunnel +{ + u32 client_index; + u32 context; + bool is_add [default=true]; + u32 instance; /* If non-~0, specifies a custom dev instance */ + vl_api_address_t src_address; + vl_api_address_t dst_address; + vl_api_interface_index_t mcast_sw_if_index; + u32 encap_vrf_id; + u32 decap_next_index; + u32 vni; +}; + +/** \brief Create or delete a VXLAN tunnel + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - Use 1 to create the tunnel, 0 to remove it + @param instance - optional unique custom device instance, else ~0. 
+ @param src_address - Source IP address + @param dst_address - Destination IP address, can be multicast + @param src_port - Source UDP port. It is not included in sent packets. Used only for port registration + @param dst_port - Destination UDP port + @param mcast_sw_if_index - Interface for multicast destination + @param encap_vrf_id - Encap route table FIB index + @param decap_next_index - index of decap next graph node + @param vni - The VXLAN Network Identifier, uint24 +*/ +define vxlan_add_del_tunnel_v2 +{ + u32 client_index; + u32 context; + bool is_add [default=true]; + u32 instance [default=0xffffffff]; /* If non-~0, specifies a custom dev instance */ + vl_api_address_t src_address; + vl_api_address_t dst_address; + u16 src_port; + u16 dst_port; + vl_api_interface_index_t mcast_sw_if_index; + u32 encap_vrf_id; + u32 decap_next_index; + u32 vni; +}; + +/** \brief Create or delete a VXLAN tunnel + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param is_add - Use 1 to create the tunnel, 0 to remove it + @param instance - optional unique custom device instance, else ~0. + @param src_address - Source IP address + @param dst_address - Destination IP address, can be multicast + @param src_port - Source UDP port. It is not included in sent packets. 
Used only for port registration + @param dst_port - Destination UDP port + @param mcast_sw_if_index - Interface for multicast destination + @param encap_vrf_id - Encap route table FIB index + @param decap_next_index - index of decap next graph node + @param vni - The VXLAN Network Identifier, uint24 + @param is_l3 - if true, create the interface in L3 mode, w/o MAC +*/ +define vxlan_add_del_tunnel_v3 +{ + u32 client_index; + u32 context; + bool is_add [default=true]; + u32 instance [default=0xffffffff]; /* If non-~0, specifies a custom dev instance */ + vl_api_address_t src_address; + vl_api_address_t dst_address; + u16 src_port; + u16 dst_port; + vl_api_interface_index_t mcast_sw_if_index; + u32 encap_vrf_id; + u32 decap_next_index; + u32 vni; + bool is_l3 [default=false]; +}; + +define vxlan_add_del_tunnel_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; +}; +define vxlan_add_del_tunnel_v2_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; +}; +define vxlan_add_del_tunnel_v3_reply +{ + u32 context; + i32 retval; + vl_api_interface_index_t sw_if_index; +}; + +define vxlan_tunnel_dump +{ + u32 client_index; + u32 context; + vl_api_interface_index_t sw_if_index; +}; +define vxlan_tunnel_v2_dump +{ + u32 client_index; + u32 context; + vl_api_interface_index_t sw_if_index; +}; + +define vxlan_tunnel_details +{ + u32 context; + vl_api_interface_index_t sw_if_index; + u32 instance; + vl_api_address_t src_address; + vl_api_address_t dst_address; + vl_api_interface_index_t mcast_sw_if_index; + u32 encap_vrf_id; + u32 decap_next_index; + u32 vni; +}; +define vxlan_tunnel_v2_details +{ + u32 context; + vl_api_interface_index_t sw_if_index; + u32 instance; + vl_api_address_t src_address; + vl_api_address_t dst_address; + u16 src_port; + u16 dst_port; + vl_api_interface_index_t mcast_sw_if_index; + u32 encap_vrf_id; + u32 decap_next_index; + u32 vni; +}; + +/** \brief Interface set vxlan-bypass request + @param 
client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param sw_if_index - interface used to reach neighbor + @param is_ipv6 - if non-zero, enable ipv6-vxlan-bypass, else ipv4-vxlan-bypass + @param enable - if non-zero enable, else disable +*/ +autoreply define sw_interface_set_vxlan_bypass +{ + u32 client_index; + u32 context; + vl_api_interface_index_t sw_if_index; + bool is_ipv6; + bool enable [default=true]; +}; + +/** \brief Offload vxlan rx request + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param hw_if_index - rx hw interface + @param sw_if_index - vxlan interface to offload + @param enable - if non-zero enable, else disable +*/ +autoreply define vxlan_offload_rx +{ + u32 client_index; + u32 context; + vl_api_interface_index_t hw_if_index; + vl_api_interface_index_t sw_if_index; + bool enable [default=true]; +}; diff --git a/src/plugins/vxlan/vxlan.c b/src/plugins/vxlan/vxlan.c new file mode 100644 index 00000000000..f1ab7a7cb8b --- /dev/null +++ b/src/plugins/vxlan/vxlan.c @@ -0,0 +1,1349 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vxlan/vxlan.h> +#include <vnet/ip/format.h> +#include <vnet/fib/fib_entry.h> +#include <vnet/fib/fib_table.h> +#include <vnet/fib/fib_entry_track.h> +#include <vnet/mfib/mfib_table.h> +#include <vnet/adj/adj_mcast.h> +#include <vnet/adj/rewrite.h> +#include <vnet/dpo/drop_dpo.h> +#include <vnet/interface.h> +#include <vnet/flow/flow.h> +#include <vnet/udp/udp_local.h> +#include <vlib/vlib.h> + +/** + * @file + * @brief VXLAN. + * + * VXLAN provides the features needed to allow L2 bridge domains (BDs) + * to span multiple servers. This is done by building an L2 overlay on + * top of an L3 network underlay using VXLAN tunnels. + * + * This makes it possible for servers to be co-located in the same data + * center or be separated geographically as long as they are reachable + * through the underlay L3 network. + * + * You can refer to this kind of L2 overlay bridge domain as a VXLAN + * (Virtual eXtensible VLAN) segment. + */ + + +vxlan_main_t vxlan_main; + +static u32 +vxlan_eth_flag_change (vnet_main_t *vnm, vnet_hw_interface_t *hi, u32 flags) +{ + /* nothing for now */ + return 0; +} + +static clib_error_t * +vxlan_eth_set_max_frame_size (vnet_main_t *vnm, vnet_hw_interface_t *hw, + u32 frame_size) +{ + /* nothing for now */ + return 0; +} + +static u8 * +format_decap_next (u8 * s, va_list * args) +{ + u32 next_index = va_arg (*args, u32); + + if (next_index == VXLAN_INPUT_NEXT_DROP) + return format (s, "drop"); + else + return format (s, "index %d", next_index); + return s; +} + +u8 * +format_vxlan_tunnel (u8 * s, va_list * args) +{ + vxlan_tunnel_t *t = va_arg (*args, vxlan_tunnel_t *); + + s = format (s, + "[%d] instance %d src %U dst %U src_port %d dst_port %d vni %d " + "fib-idx %d sw-if-idx %d ", + t->dev_instance, t->user_instance, format_ip46_address, &t->src, + IP46_TYPE_ANY, format_ip46_address, &t->dst, IP46_TYPE_ANY, + t->src_port, t->dst_port, t->vni, t->encap_fib_index, + t->sw_if_index); + + s = format (s, "encap-dpo-idx %d ", 
t->next_dpo.dpoi_index); + + if (PREDICT_FALSE (t->decap_next_index != VXLAN_INPUT_NEXT_L2_INPUT)) + s = format (s, "decap-next-%U ", format_decap_next, t->decap_next_index); + + if (PREDICT_FALSE (ip46_address_is_multicast (&t->dst))) + s = format (s, "mcast-sw-if-idx %d ", t->mcast_sw_if_index); + + if (t->flow_index != ~0) + s = format (s, "flow-index %d [%U]", t->flow_index, + format_flow_enabled_hw, t->flow_index); + + return s; +} + +static u8 * +format_vxlan_name (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + vxlan_main_t *vxm = &vxlan_main; + vxlan_tunnel_t *t; + + if (dev_instance == ~0) + return format (s, "<cached-unused>"); + + if (dev_instance >= vec_len (vxm->tunnels)) + return format (s, "<improperly-referenced>"); + + t = pool_elt_at_index (vxm->tunnels, dev_instance); + + return format (s, "vxlan_tunnel%d", t->user_instance); +} + +static clib_error_t * +vxlan_interface_admin_up_down (vnet_main_t * vnm, u32 hw_if_index, u32 flags) +{ + u32 hw_flags = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) ? 
+ VNET_HW_INTERFACE_FLAG_LINK_UP : 0; + vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags); + + return /* no error */ 0; +} + +/* *INDENT-OFF* */ +VNET_DEVICE_CLASS (vxlan_device_class, static) = { + .name = "VXLAN", + .format_device_name = format_vxlan_name, + .format_tx_trace = format_vxlan_encap_trace, + .admin_up_down_function = vxlan_interface_admin_up_down, +}; +/* *INDENT-ON* */ + +static u8 * +format_vxlan_header_with_length (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + s = format (s, "unimplemented dev %u", dev_instance); + return s; +} + +/* *INDENT-OFF* */ +VNET_HW_INTERFACE_CLASS (vxlan_hw_class) = { + .name = "VXLAN", + .format_header = format_vxlan_header_with_length, + .build_rewrite = default_build_rewrite, +}; +/* *INDENT-ON* */ + +static void +vxlan_tunnel_restack_dpo (vxlan_tunnel_t * t) +{ + u8 is_ip4 = ip46_address_is_ip4 (&t->dst); + dpo_id_t dpo = DPO_INVALID; + fib_forward_chain_type_t forw_type = is_ip4 ? + FIB_FORW_CHAIN_TYPE_UNICAST_IP4 : FIB_FORW_CHAIN_TYPE_UNICAST_IP6; + + fib_entry_contribute_forwarding (t->fib_entry_index, forw_type, &dpo); + + /* vxlan uses the payload hash as the udp source port + * hence the packet's hash is unknown + * skip single bucket load balance dpo's */ + while (DPO_LOAD_BALANCE == dpo.dpoi_type) + { + const load_balance_t *lb; + const dpo_id_t *choice; + + lb = load_balance_get (dpo.dpoi_index); + if (lb->lb_n_buckets > 1) + break; + + choice = load_balance_get_bucket_i (lb, 0); + + if (DPO_RECEIVE == choice->dpoi_type) + dpo_copy (&dpo, drop_dpo_get (choice->dpoi_proto)); + else + dpo_copy (&dpo, choice); + } + + u32 encap_index = is_ip4 ? 
+ vxlan4_encap_node.index : vxlan6_encap_node.index; + dpo_stack_from_node (encap_index, &t->next_dpo, &dpo); + dpo_reset (&dpo); +} + +static vxlan_tunnel_t * +vxlan_tunnel_from_fib_node (fib_node_t * node) +{ + ASSERT (FIB_NODE_TYPE_VXLAN_TUNNEL == node->fn_type); + return ((vxlan_tunnel_t *) (((char *) node) - + STRUCT_OFFSET_OF (vxlan_tunnel_t, node))); +} + +/** + * Function definition to backwalk a FIB node - + * Here we will restack the new dpo of VXLAN DIP to encap node. + */ +static fib_node_back_walk_rc_t +vxlan_tunnel_back_walk (fib_node_t * node, fib_node_back_walk_ctx_t * ctx) +{ + vxlan_tunnel_restack_dpo (vxlan_tunnel_from_fib_node (node)); + return (FIB_NODE_BACK_WALK_CONTINUE); +} + +/** + * Function definition to get a FIB node from its index + */ +static fib_node_t * +vxlan_tunnel_fib_node_get (fib_node_index_t index) +{ + vxlan_tunnel_t *t; + vxlan_main_t *vxm = &vxlan_main; + + t = pool_elt_at_index (vxm->tunnels, index); + + return (&t->node); +} + +/** + * Function definition to inform the FIB node that its last lock has gone. + */ +static void +vxlan_tunnel_last_lock_gone (fib_node_t * node) +{ + /* + * The VXLAN tunnel is a root of the graph. As such + * it never has children and thus is never locked. + */ + ASSERT (0); +} + +/* + * Virtual function table registered by VXLAN tunnels + * for participation in the FIB object graph. + */ +const static fib_node_vft_t vxlan_vft = { + .fnv_get = vxlan_tunnel_fib_node_get, + .fnv_last_lock = vxlan_tunnel_last_lock_gone, + .fnv_back_walk = vxlan_tunnel_back_walk, +}; + +#define foreach_copy_field \ + _ (vni) \ + _ (mcast_sw_if_index) \ + _ (encap_fib_index) \ + _ (decap_next_index) \ + _ (src) \ + _ (dst) \ + _ (src_port) \ + _ (dst_port) + +static void +vxlan_rewrite (vxlan_tunnel_t * t, bool is_ip6) +{ + union + { + ip4_vxlan_header_t h4; + ip6_vxlan_header_t h6; + } h; + int len = is_ip6 ? 
sizeof h.h6 : sizeof h.h4; + + udp_header_t *udp; + vxlan_header_t *vxlan; + /* Fixed portion of the (outer) ip header */ + + clib_memset (&h, 0, sizeof (h)); + if (!is_ip6) + { + ip4_header_t *ip = &h.h4.ip4; + udp = &h.h4.udp, vxlan = &h.h4.vxlan; + ip->ip_version_and_header_length = 0x45; + ip->ttl = 254; + ip->protocol = IP_PROTOCOL_UDP; + + ip->src_address = t->src.ip4; + ip->dst_address = t->dst.ip4; + + /* we fix up the ip4 header length and checksum after-the-fact */ + ip->checksum = ip4_header_checksum (ip); + } + else + { + ip6_header_t *ip = &h.h6.ip6; + udp = &h.h6.udp, vxlan = &h.h6.vxlan; + ip->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32 (6 << 28); + ip->hop_limit = 255; + ip->protocol = IP_PROTOCOL_UDP; + + ip->src_address = t->src.ip6; + ip->dst_address = t->dst.ip6; + } + + /* UDP header, randomize src port on something, maybe? */ + udp->src_port = clib_host_to_net_u16 (t->src_port); + udp->dst_port = clib_host_to_net_u16 (t->dst_port); + + /* VXLAN header */ + vnet_set_vni_and_flags (vxlan, t->vni); + vnet_rewrite_set_data (*t, &h, len); +} + +static bool +vxlan_decap_next_is_valid (vxlan_main_t * vxm, u32 is_ip6, + u32 decap_next_index) +{ + vlib_main_t *vm = vxm->vlib_main; + u32 input_idx = (!is_ip6) ? 
+ vxlan4_input_node.index : vxlan6_input_node.index; + vlib_node_runtime_t *r = vlib_node_get_runtime (vm, input_idx); + + return decap_next_index < r->n_next_nodes; +} + +/* *INDENT-OFF* */ +typedef CLIB_PACKED(union +{ + struct + { + fib_node_index_t mfib_entry_index; + adj_index_t mcast_adj_index; + }; + u64 as_u64; +}) mcast_shared_t; +/* *INDENT-ON* */ + +static inline mcast_shared_t +mcast_shared_get (ip46_address_t * ip) +{ + ASSERT (ip46_address_is_multicast (ip)); + uword *p = hash_get_mem (vxlan_main.mcast_shared, ip); + ALWAYS_ASSERT (p); + mcast_shared_t ret = {.as_u64 = *p }; + return ret; +} + +static inline void +mcast_shared_add (ip46_address_t * dst, fib_node_index_t mfei, adj_index_t ai) +{ + mcast_shared_t new_ep = { + .mcast_adj_index = ai, + .mfib_entry_index = mfei, + }; + + hash_set_mem_alloc (&vxlan_main.mcast_shared, dst, new_ep.as_u64); +} + +static inline void +mcast_shared_remove (ip46_address_t * dst) +{ + mcast_shared_t ep = mcast_shared_get (dst); + + adj_unlock (ep.mcast_adj_index); + mfib_table_entry_delete_index (ep.mfib_entry_index, MFIB_SOURCE_VXLAN); + + hash_unset_mem_free (&vxlan_main.mcast_shared, dst); +} + +int vnet_vxlan_add_del_tunnel + (vnet_vxlan_add_del_tunnel_args_t * a, u32 * sw_if_indexp) +{ + vxlan_main_t *vxm = &vxlan_main; + vnet_main_t *vnm = vxm->vnet_main; + vxlan_decap_info_t *p; + u32 sw_if_index = ~0; + vxlan4_tunnel_key_t key4; + vxlan6_tunnel_key_t key6; + u32 is_ip6 = a->is_ip6; + vlib_main_t *vm = vlib_get_main (); + u8 hw_addr[6]; + + /* Set udp-ports */ + if (a->src_port == 0) + a->src_port = is_ip6 ? UDP_DST_PORT_vxlan6 : UDP_DST_PORT_vxlan; + + if (a->dst_port == 0) + a->dst_port = is_ip6 ? UDP_DST_PORT_vxlan6 : UDP_DST_PORT_vxlan; + + int not_found; + if (!is_ip6) + { + /* ip4 mcast is indexed by mcast addr only */ + key4.key[0] = ip46_address_is_multicast (&a->dst) ? 
+ a->dst.ip4.as_u32 : + a->dst.ip4.as_u32 | (((u64) a->src.ip4.as_u32) << 32); + key4.key[1] = ((u64) clib_host_to_net_u16 (a->src_port) << 48) | + (((u64) a->encap_fib_index) << 32) | + clib_host_to_net_u32 (a->vni << 8); + not_found = + clib_bihash_search_inline_16_8 (&vxm->vxlan4_tunnel_by_key, &key4); + p = (void *) &key4.value; + } + else + { + key6.key[0] = a->dst.ip6.as_u64[0]; + key6.key[1] = a->dst.ip6.as_u64[1]; + key6.key[2] = (((u64) clib_host_to_net_u16 (a->src_port) << 48) | + ((u64) a->encap_fib_index) << 32) | + clib_host_to_net_u32 (a->vni << 8); + not_found = + clib_bihash_search_inline_24_8 (&vxm->vxlan6_tunnel_by_key, &key6); + p = (void *) &key6.value; + } + + if (not_found) + p = 0; + + if (a->is_add) + { + l2input_main_t *l2im = &l2input_main; + u32 dev_instance; /* real dev instance tunnel index */ + u32 user_instance; /* request and actual instance number */ + + /* adding a tunnel: tunnel must not already exist */ + if (p) + return VNET_API_ERROR_TUNNEL_EXIST; + + /*if not set explicitly, default to l2 */ + if (a->decap_next_index == ~0) + a->decap_next_index = VXLAN_INPUT_NEXT_L2_INPUT; + if (!vxlan_decap_next_is_valid (vxm, is_ip6, a->decap_next_index)) + return VNET_API_ERROR_INVALID_DECAP_NEXT; + + vxlan_tunnel_t *t; + pool_get_aligned (vxm->tunnels, t, CLIB_CACHE_LINE_BYTES); + clib_memset (t, 0, sizeof (*t)); + dev_instance = t - vxm->tunnels; + + /* copy from arg structure */ +#define _(x) t->x = a->x; + foreach_copy_field; +#undef _ + + vxlan_rewrite (t, is_ip6); + /* + * Reconcile the real dev_instance and a possible requested instance. 
+ */ + user_instance = a->instance; + if (user_instance == ~0) + user_instance = dev_instance; + if (hash_get (vxm->instance_used, user_instance)) + { + pool_put (vxm->tunnels, t); + return VNET_API_ERROR_INSTANCE_IN_USE; + } + + hash_set (vxm->instance_used, user_instance, 1); + + t->dev_instance = dev_instance; /* actual */ + t->user_instance = user_instance; /* name */ + t->flow_index = ~0; + + if (a->is_l3) + t->hw_if_index = + vnet_register_interface (vnm, vxlan_device_class.index, dev_instance, + vxlan_hw_class.index, dev_instance); + else + { + vnet_eth_interface_registration_t eir = {}; + f64 now = vlib_time_now (vm); + u32 rnd; + rnd = (u32) (now * 1e6); + rnd = random_u32 (&rnd); + memcpy (hw_addr + 2, &rnd, sizeof (rnd)); + hw_addr[0] = 2; + hw_addr[1] = 0xfe; + + eir.dev_class_index = vxlan_device_class.index; + eir.dev_instance = dev_instance; + eir.address = hw_addr; + eir.cb.flag_change = vxlan_eth_flag_change; + eir.cb.set_max_frame_size = vxlan_eth_set_max_frame_size; + t->hw_if_index = vnet_eth_register_interface (vnm, &eir); + } + + vnet_hw_interface_t *hi = vnet_get_hw_interface (vnm, t->hw_if_index); + + /* Set vxlan tunnel output node */ + u32 encap_index = !is_ip6 ? 
+ vxlan4_encap_node.index : vxlan6_encap_node.index; + vnet_set_interface_output_node (vnm, t->hw_if_index, encap_index); + + t->sw_if_index = sw_if_index = hi->sw_if_index; + + /* copy the key */ + int add_failed; + if (is_ip6) + { + key6.value = (u64) dev_instance; + add_failed = clib_bihash_add_del_24_8 (&vxm->vxlan6_tunnel_by_key, + &key6, 1 /*add */ ); + } + else + { + vxlan_decap_info_t di = {.sw_if_index = t->sw_if_index, }; + if (ip46_address_is_multicast (&t->dst)) + di.local_ip = t->src.ip4; + else + di.next_index = t->decap_next_index; + key4.value = di.as_u64; + add_failed = clib_bihash_add_del_16_8 (&vxm->vxlan4_tunnel_by_key, + &key4, 1 /*add */ ); + } + + if (add_failed) + { + if (a->is_l3) + vnet_delete_hw_interface (vnm, t->hw_if_index); + else + ethernet_delete_interface (vnm, t->hw_if_index); + hash_unset (vxm->instance_used, t->user_instance); + pool_put (vxm->tunnels, t); + return VNET_API_ERROR_INVALID_REGISTRATION; + } + + vec_validate_init_empty (vxm->tunnel_index_by_sw_if_index, sw_if_index, + ~0); + vxm->tunnel_index_by_sw_if_index[sw_if_index] = dev_instance; + + /* setup l2 input config with l2 feature and bd 0 to drop packet */ + vec_validate (l2im->configs, sw_if_index); + l2im->configs[sw_if_index].feature_bitmap = L2INPUT_FEAT_DROP; + l2im->configs[sw_if_index].bd_index = 0; + + vnet_sw_interface_t *si = vnet_get_sw_interface (vnm, sw_if_index); + si->flags &= ~VNET_SW_INTERFACE_FLAG_HIDDEN; + vnet_sw_interface_set_flags (vnm, sw_if_index, + VNET_SW_INTERFACE_FLAG_ADMIN_UP); + + fib_node_init (&t->node, FIB_NODE_TYPE_VXLAN_TUNNEL); + fib_prefix_t tun_dst_pfx; + vnet_flood_class_t flood_class = VNET_FLOOD_CLASS_TUNNEL_NORMAL; + + fib_protocol_t fp = fib_ip_proto (is_ip6); + fib_prefix_from_ip46_addr (fp, &t->dst, &tun_dst_pfx); + if (!ip46_address_is_multicast (&t->dst)) + { + /* Unicast tunnel - + * source the FIB entry for the tunnel's destination + * and become a child thereof. 
The tunnel will then get poked + * when the forwarding for the entry updates, and the tunnel can + * re-stack accordingly + */ + vtep_addr_ref (&vxm->vtep_table, t->encap_fib_index, &t->src); + t->fib_entry_index = fib_entry_track (t->encap_fib_index, + &tun_dst_pfx, + FIB_NODE_TYPE_VXLAN_TUNNEL, + dev_instance, + &t->sibling_index); + vxlan_tunnel_restack_dpo (t); + } + else + { + /* Multicast tunnel - + * as the same mcast group can be used for multiple mcast tunnels + * with different VNIs, create the output fib adjacency only if + * it does not already exist + */ + if (vtep_addr_ref (&vxm->vtep_table, + t->encap_fib_index, &t->dst) == 1) + { + fib_node_index_t mfei; + adj_index_t ai; + fib_route_path_t path = { + .frp_proto = fib_proto_to_dpo (fp), + .frp_addr = zero_addr, + .frp_sw_if_index = 0xffffffff, + .frp_fib_index = ~0, + .frp_weight = 1, + .frp_flags = FIB_ROUTE_PATH_LOCAL, + .frp_mitf_flags = MFIB_ITF_FLAG_FORWARD, + }; + const mfib_prefix_t mpfx = { + .fp_proto = fp, + .fp_len = (is_ip6 ? 
128 : 32), + .fp_grp_addr = tun_dst_pfx.fp_addr, + }; + + /* + * Setup the (*,G) to receive traffic on the mcast group + * - the forwarding interface is for-us + * - the accepting interface is that from the API + */ + mfib_table_entry_path_update (t->encap_fib_index, &mpfx, + MFIB_SOURCE_VXLAN, + MFIB_ENTRY_FLAG_NONE, &path); + + path.frp_sw_if_index = a->mcast_sw_if_index; + path.frp_flags = FIB_ROUTE_PATH_FLAG_NONE; + path.frp_mitf_flags = MFIB_ITF_FLAG_ACCEPT; + mfei = mfib_table_entry_path_update ( + t->encap_fib_index, &mpfx, MFIB_SOURCE_VXLAN, + MFIB_ENTRY_FLAG_NONE, &path); + + /* + * Create the mcast adjacency to send traffic to the group + */ + ai = adj_mcast_add_or_lock (fp, + fib_proto_to_link (fp), + a->mcast_sw_if_index); + + /* + * create a new end-point + */ + mcast_shared_add (&t->dst, mfei, ai); + } + + dpo_id_t dpo = DPO_INVALID; + mcast_shared_t ep = mcast_shared_get (&t->dst); + + /* Stack shared mcast dst mac addr rewrite on encap */ + dpo_set (&dpo, DPO_ADJACENCY_MCAST, + fib_proto_to_dpo (fp), ep.mcast_adj_index); + + dpo_stack_from_node (encap_index, &t->next_dpo, &dpo); + dpo_reset (&dpo); + flood_class = VNET_FLOOD_CLASS_TUNNEL_MASTER; + } + + vnet_get_sw_interface (vnet_get_main (), sw_if_index)->flood_class = + flood_class; + } + else + { + /* deleting a tunnel: tunnel must exist */ + if (!p) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + u32 instance = is_ip6 ? 
key6.value : + vxm->tunnel_index_by_sw_if_index[p->sw_if_index]; + vxlan_tunnel_t *t = pool_elt_at_index (vxm->tunnels, instance); + + sw_if_index = t->sw_if_index; + vnet_sw_interface_set_flags (vnm, sw_if_index, 0 /* down */ ); + + vxm->tunnel_index_by_sw_if_index[sw_if_index] = ~0; + + if (!is_ip6) + clib_bihash_add_del_16_8 (&vxm->vxlan4_tunnel_by_key, &key4, + 0 /*del */ ); + else + clib_bihash_add_del_24_8 (&vxm->vxlan6_tunnel_by_key, &key6, + 0 /*del */ ); + + if (!ip46_address_is_multicast (&t->dst)) + { + if (t->flow_index != ~0) + vnet_flow_del (vnm, t->flow_index); + + vtep_addr_unref (&vxm->vtep_table, t->encap_fib_index, &t->src); + fib_entry_untrack (t->fib_entry_index, t->sibling_index); + } + else if (vtep_addr_unref (&vxm->vtep_table, + t->encap_fib_index, &t->dst) == 0) + { + mcast_shared_remove (&t->dst); + } + + vnet_hw_interface_t *hw = vnet_get_hw_interface (vnm, t->hw_if_index); + if (hw->dev_class_index == vxlan_device_class.index) + vnet_delete_hw_interface (vnm, t->hw_if_index); + else + ethernet_delete_interface (vnm, t->hw_if_index); + hash_unset (vxm->instance_used, t->user_instance); + + fib_node_deinit (&t->node); + pool_put (vxm->tunnels, t); + } + + if (sw_if_indexp) + *sw_if_indexp = sw_if_index; + + if (a->is_add) + { + /* register udp ports */ + if (!is_ip6 && !udp_is_valid_dst_port (a->src_port, 1)) + udp_register_dst_port (vxm->vlib_main, a->src_port, + vxlan4_input_node.index, 1); + if (is_ip6 && !udp_is_valid_dst_port (a->src_port, 0)) + udp_register_dst_port (vxm->vlib_main, a->src_port, + vxlan6_input_node.index, 0); + } + + return 0; +} + +static uword +get_decap_next_for_node (u32 node_index, u32 ipv4_set) +{ + vxlan_main_t *vxm = &vxlan_main; + vlib_main_t *vm = vxm->vlib_main; + uword input_node = (ipv4_set) ? 
vxlan4_input_node.index : + vxlan6_input_node.index; + + return vlib_node_add_next (vm, input_node, node_index); +} + +static uword +unformat_decap_next (unformat_input_t * input, va_list * args) +{ + u32 *result = va_arg (*args, u32 *); + u32 ipv4_set = va_arg (*args, int); + vxlan_main_t *vxm = &vxlan_main; + vlib_main_t *vm = vxm->vlib_main; + u32 node_index; + u32 tmp; + + if (unformat (input, "l2")) + *result = VXLAN_INPUT_NEXT_L2_INPUT; + else if (unformat (input, "node %U", unformat_vlib_node, vm, &node_index)) + *result = get_decap_next_for_node (node_index, ipv4_set); + else if (unformat (input, "%d", &tmp)) + *result = tmp; + else + return 0; + return 1; +} + +static clib_error_t * +vxlan_add_del_tunnel_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + ip46_address_t src = ip46_address_initializer, dst = + ip46_address_initializer; + u8 is_add = 1; + u8 src_set = 0; + u8 dst_set = 0; + u8 grp_set = 0; + u8 ipv4_set = 0; + u8 ipv6_set = 0; + u8 is_l3 = 0; + u32 instance = ~0; + u32 encap_fib_index = 0; + u32 mcast_sw_if_index = ~0; + u32 decap_next_index = VXLAN_INPUT_NEXT_L2_INPUT; + u32 vni = 0; + u32 src_port = 0; + u32 dst_port = 0; + u32 table_id; + clib_error_t *parse_error = NULL; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "del")) + { + is_add = 0; + } + else if (unformat (line_input, "instance %d", &instance)) + ; + else if (unformat (line_input, "src %U", + unformat_ip46_address, &src, IP46_TYPE_ANY)) + { + src_set = 1; + ip46_address_is_ip4 (&src) ? (ipv4_set = 1) : (ipv6_set = 1); + } + else if (unformat (line_input, "dst %U", + unformat_ip46_address, &dst, IP46_TYPE_ANY)) + { + dst_set = 1; + ip46_address_is_ip4 (&dst) ? 
(ipv4_set = 1) : (ipv6_set = 1); + } + else if (unformat (line_input, "group %U %U", + unformat_ip46_address, &dst, IP46_TYPE_ANY, + unformat_vnet_sw_interface, + vnet_get_main (), &mcast_sw_if_index)) + { + grp_set = dst_set = 1; + ip46_address_is_ip4 (&dst) ? (ipv4_set = 1) : (ipv6_set = 1); + } + else if (unformat (line_input, "encap-vrf-id %d", &table_id)) + { + encap_fib_index = + fib_table_find (fib_ip_proto (ipv6_set), table_id); + } + else if (unformat (line_input, "l3")) + is_l3 = 1; + else if (unformat (line_input, "decap-next %U", unformat_decap_next, + &decap_next_index, ipv4_set)) + ; + else if (unformat (line_input, "vni %d", &vni)) + ; + else if (unformat (line_input, "src_port %d", &src_port)) + ; + else if (unformat (line_input, "dst_port %d", &dst_port)) + ; + else + { + parse_error = clib_error_return (0, "parse error: '%U'", + format_unformat_error, line_input); + break; + } + } + + unformat_free (line_input); + + if (parse_error) + return parse_error; + + if (is_l3 && decap_next_index == VXLAN_INPUT_NEXT_L2_INPUT) + { + vlib_node_t *node = vlib_get_node_by_name ( + vm, (u8 *) (ipv4_set ? 
"ip4-input" : "ip6-input")); + decap_next_index = get_decap_next_for_node (node->index, ipv4_set); + } + + if (encap_fib_index == ~0) + return clib_error_return (0, "nonexistent encap-vrf-id %d", table_id); + + if (src_set == 0) + return clib_error_return (0, "tunnel src address not specified"); + + if (dst_set == 0) + return clib_error_return (0, "tunnel dst address not specified"); + + if (grp_set && !ip46_address_is_multicast (&dst)) + return clib_error_return (0, "tunnel group address not multicast"); + + if (grp_set == 0 && ip46_address_is_multicast (&dst)) + return clib_error_return (0, "dst address must be unicast"); + + if (grp_set && mcast_sw_if_index == ~0) + return clib_error_return (0, "tunnel nonexistent multicast device"); + + if (ipv4_set && ipv6_set) + return clib_error_return (0, "both IPv4 and IPv6 addresses specified"); + + if (ip46_address_cmp (&src, &dst) == 0) + return clib_error_return (0, "src and dst addresses are identical"); + + if (decap_next_index == ~0) + return clib_error_return (0, "next node not found"); + + if (vni == 0) + return clib_error_return (0, "vni not specified"); + + if (vni >> 24) + return clib_error_return (0, "vni %d out of range", vni); + + vnet_vxlan_add_del_tunnel_args_t a = { .is_add = is_add, + .is_ip6 = ipv6_set, + .is_l3 = is_l3, + .instance = instance, +#define _(x) .x = x, + foreach_copy_field +#undef _ + }; + + u32 tunnel_sw_if_index; + int rv = vnet_vxlan_add_del_tunnel (&a, &tunnel_sw_if_index); + + switch (rv) + { + case 0: + if (is_add) + vlib_cli_output (vm, "%U\n", format_vnet_sw_if_index_name, + vnet_get_main (), tunnel_sw_if_index); + break; + + case VNET_API_ERROR_TUNNEL_EXIST: + return clib_error_return (0, "tunnel already exists..."); + + case VNET_API_ERROR_NO_SUCH_ENTRY: + return clib_error_return (0, "tunnel does not exist..."); + + case VNET_API_ERROR_INSTANCE_IN_USE: + return clib_error_return (0, "Instance is in use"); + + default: + return clib_error_return + (0, "vnet_vxlan_add_del_tunnel 
returned %d", rv); + } + + return 0; +} + +/*? + * Add or delete a VXLAN Tunnel. + * + * VXLAN provides the features needed to allow L2 bridge domains (BDs) + * to span multiple servers. This is done by building an L2 overlay on + * top of an L3 network underlay using VXLAN tunnels. + * + * This makes it possible for servers to be co-located in the same data + * center or be separated geographically as long as they are reachable + * through the underlay L3 network. + * + * You can refer to this kind of L2 overlay bridge domain as a VXLAN + * (Virtual eXtensible VLAN) segment. + * + * @cliexpar + * Example of how to create a VXLAN Tunnel: + * @cliexcmd{create vxlan tunnel src 10.0.3.1 dst 10.0.3.3 vni 13 encap-vrf-id + 7} + * Example of how to create a VXLAN Tunnel with a known name, vxlan_tunnel42: + * @cliexcmd{create vxlan tunnel src 10.0.3.1 dst 10.0.3.3 instance 42} + * Example of how to create a multicast VXLAN Tunnel with a known name, + vxlan_tunnel23: + * @cliexcmd{create vxlan tunnel src 10.0.3.1 group 239.1.1.1 + GigabitEthernet0/8/0 instance 23} + * Example of how to create a VXLAN Tunnel with custom udp-ports: + * @cliexcmd{create vxlan tunnel src 10.0.3.1 dst 10.0.3.3 vni 13 src_port + 59000 dst_port 59001} + * Example of how to delete a VXLAN Tunnel: + * @cliexcmd{create vxlan tunnel src 10.0.3.1 dst 10.0.3.3 vni 13 del} + ?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (create_vxlan_tunnel_command, static) = { + .path = "create vxlan tunnel", + .short_help = + "create vxlan tunnel src <local-vtep-addr>" + " {dst <remote-vtep-addr>|group <mcast-vtep-addr> <intf-name>} vni <nn>" + " [instance <id>]" + " [encap-vrf-id <nn>] [decap-next [l2|node <name>]] [del] [l3]" + " [src_port <local-vtep-udp-port>] [dst_port <remote-vtep-udp-port>]", + .function = vxlan_add_del_tunnel_command_fn, +}; +/* *INDENT-ON* */ + +static clib_error_t * +show_vxlan_tunnel_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vxlan_main_t *vxm = 
&vxlan_main; + vxlan_tunnel_t *t; + int raw = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "raw")) + raw = 1; + else + return clib_error_return (0, "parse error: '%U'", + format_unformat_error, input); + } + + if (pool_elts (vxm->tunnels) == 0) + vlib_cli_output (vm, "No vxlan tunnels configured..."); + +/* *INDENT-OFF* */ + pool_foreach (t, vxm->tunnels) + { + vlib_cli_output (vm, "%U", format_vxlan_tunnel, t); + } +/* *INDENT-ON* */ + + if (raw) + { + vlib_cli_output (vm, "Raw IPv4 Hash Table:\n%U\n", + format_bihash_16_8, &vxm->vxlan4_tunnel_by_key, + 1 /* verbose */ ); + vlib_cli_output (vm, "Raw IPv6 Hash Table:\n%U\n", + format_bihash_24_8, &vxm->vxlan6_tunnel_by_key, + 1 /* verbose */ ); + } + + return 0; +} + +/*? + * Display all the VXLAN Tunnel entries. + * + * @cliexpar + * Example of how to display the VXLAN Tunnel entries: + * @cliexstart{show vxlan tunnel} + * [0] src 10.0.3.1 dst 10.0.3.3 src_port 4789 dst_port 4789 vni 13 + encap_fib_index 0 sw_if_index 5 decap_next l2 + * @cliexend + ?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (show_vxlan_tunnel_command, static) = { + .path = "show vxlan tunnel", + .short_help = "show vxlan tunnel [raw]", + .function = show_vxlan_tunnel_command_fn, +}; +/* *INDENT-ON* */ + + +void +vnet_int_vxlan_bypass_mode (u32 sw_if_index, u8 is_ip6, u8 is_enable) +{ + vxlan_main_t *vxm = &vxlan_main; + + if (pool_is_free_index (vxm->vnet_main->interface_main.sw_interfaces, + sw_if_index)) + return; + + is_enable = ! 
!is_enable; + + if (is_ip6) + { + if (clib_bitmap_get (vxm->bm_ip6_bypass_enabled_by_sw_if, sw_if_index) + != is_enable) + { + vnet_feature_enable_disable ("ip6-unicast", "ip6-vxlan-bypass", + sw_if_index, is_enable, 0, 0); + vxm->bm_ip6_bypass_enabled_by_sw_if = + clib_bitmap_set (vxm->bm_ip6_bypass_enabled_by_sw_if, + sw_if_index, is_enable); + } + } + else + { + if (clib_bitmap_get (vxm->bm_ip4_bypass_enabled_by_sw_if, sw_if_index) + != is_enable) + { + vnet_feature_enable_disable ("ip4-unicast", "ip4-vxlan-bypass", + sw_if_index, is_enable, 0, 0); + vxm->bm_ip4_bypass_enabled_by_sw_if = + clib_bitmap_set (vxm->bm_ip4_bypass_enabled_by_sw_if, + sw_if_index, is_enable); + } + } +} + + +static clib_error_t * +set_ip_vxlan_bypass (u32 is_ip6, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + clib_error_t *error = 0; + u32 sw_if_index, is_enable; + + sw_if_index = ~0; + is_enable = 1; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat_user + (line_input, unformat_vnet_sw_interface, vnm, &sw_if_index)) + ; + else if (unformat (line_input, "del")) + is_enable = 0; + else + { + error = unformat_parse_error (line_input); + goto done; + } + } + + if (~0 == sw_if_index) + { + error = clib_error_return (0, "unknown interface `%U'", + format_unformat_error, line_input); + goto done; + } + + vnet_int_vxlan_bypass_mode (sw_if_index, is_ip6, is_enable); + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * +set_ip4_vxlan_bypass (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + return set_ip_vxlan_bypass (0, input, cmd); +} + +/*? + * This command adds the 'ip4-vxlan-bypass' graph node for a given interface. 
+ * By adding the IPv4 vxlan-bypass graph node to an interface, the node checks + * for and validate input vxlan packet and bypass ip4-lookup, ip4-local, + * ip4-udp-lookup nodes to speedup vxlan packet forwarding. This node will + * cause extra overhead to for non-vxlan packets which is kept at a minimum. + * + * @cliexpar + * @parblock + * Example of graph node before ip4-vxlan-bypass is enabled: + * @cliexstart{show vlib graph ip4-vxlan-bypass} + * Name Next Previous + * ip4-vxlan-bypass error-drop [0] + * vxlan4-input [1] + * ip4-lookup [2] + * @cliexend + * + * Example of how to enable ip4-vxlan-bypass on an interface: + * @cliexcmd{set interface ip vxlan-bypass GigabitEthernet2/0/0} + * + * Example of graph node after ip4-vxlan-bypass is enabled: + * @cliexstart{show vlib graph ip4-vxlan-bypass} + * Name Next Previous + * ip4-vxlan-bypass error-drop [0] ip4-input + * vxlan4-input [1] ip4-input-no-checksum + * ip4-lookup [2] + * @cliexend + * + * Example of how to display the feature enabled on an interface: + * @cliexstart{show ip interface features GigabitEthernet2/0/0} + * IP feature paths configured on GigabitEthernet2/0/0... + * ... + * ipv4 unicast: + * ip4-vxlan-bypass + * ip4-lookup + * ... + * @cliexend + * + * Example of how to disable ip4-vxlan-bypass on an interface: + * @cliexcmd{set interface ip vxlan-bypass GigabitEthernet2/0/0 del} + * @endparblock +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_interface_ip_vxlan_bypass_command, static) = { + .path = "set interface ip vxlan-bypass", + .function = set_ip4_vxlan_bypass, + .short_help = "set interface ip vxlan-bypass <interface> [del]", +}; +/* *INDENT-ON* */ + +static clib_error_t * +set_ip6_vxlan_bypass (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + return set_ip_vxlan_bypass (1, input, cmd); +} + +/*? + * This command adds the 'ip6-vxlan-bypass' graph node for a given interface. 
+ * By adding the IPv6 vxlan-bypass graph node to an interface, the node checks
+ * for and validates input vxlan packets and bypasses the ip6-lookup, ip6-local,
+ * ip6-udp-lookup nodes to speed up vxlan packet forwarding. This node adds
+ * extra overhead for non-vxlan packets, which is kept to a minimum.
+ * @cliexend + * + * Example of how to disable ip6-vxlan-bypass on an interface: + * @cliexcmd{set interface ip6 vxlan-bypass GigabitEthernet2/0/0 del} + * @endparblock +?*/ +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (set_interface_ip6_vxlan_bypass_command, static) = { + .path = "set interface ip6 vxlan-bypass", + .function = set_ip6_vxlan_bypass, + .short_help = "set interface ip6 vxlan-bypass <interface> [del]", +}; +/* *INDENT-ON* */ + +int +vnet_vxlan_add_del_rx_flow (u32 hw_if_index, u32 t_index, int is_add) +{ + vxlan_main_t *vxm = &vxlan_main; + vxlan_tunnel_t *t = pool_elt_at_index (vxm->tunnels, t_index); + vnet_main_t *vnm = vnet_get_main (); + if (is_add) + { + if (t->flow_index == ~0) + { + vxlan_main_t *vxm = &vxlan_main; + vnet_flow_t flow = { + .actions = + VNET_FLOW_ACTION_REDIRECT_TO_NODE | VNET_FLOW_ACTION_MARK | + VNET_FLOW_ACTION_BUFFER_ADVANCE, + .mark_flow_id = t->dev_instance + vxm->flow_id_start, + .redirect_node_index = vxlan4_flow_input_node.index, + .buffer_advance = sizeof (ethernet_header_t), + .type = VNET_FLOW_TYPE_IP4_VXLAN, + .ip4_vxlan = { + .protocol.prot = IP_PROTOCOL_UDP, + .src_addr.addr = t->dst.ip4, + .dst_addr.addr = t->src.ip4, + .src_addr.mask.as_u32 = ~0, + .dst_addr.mask.as_u32 = ~0, + .dst_port.port = t->src_port, + .dst_port.mask = 0xFF, + .vni = t->vni, + } + , + }; + vnet_flow_add (vnm, &flow, &t->flow_index); + } + return vnet_flow_enable (vnm, t->flow_index, hw_if_index); + } + /* flow index is removed when the tunnel is deleted */ + return vnet_flow_disable (vnm, t->flow_index, hw_if_index); +} + +u32 +vnet_vxlan_get_tunnel_index (u32 sw_if_index) +{ + vxlan_main_t *vxm = &vxlan_main; + + if (sw_if_index >= vec_len (vxm->tunnel_index_by_sw_if_index)) + return ~0; + return vxm->tunnel_index_by_sw_if_index[sw_if_index]; +} + +static clib_error_t * +vxlan_offload_command_fn (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + + /* Get a 
line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + vnet_main_t *vnm = vnet_get_main (); + u32 rx_sw_if_index = ~0; + u32 hw_if_index = ~0; + int is_add = 1; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "hw %U", unformat_vnet_hw_interface, vnm, + &hw_if_index)) + continue; + if (unformat (line_input, "rx %U", unformat_vnet_sw_interface, vnm, + &rx_sw_if_index)) + continue; + if (unformat (line_input, "del")) + { + is_add = 0; + continue; + } + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + } + + if (rx_sw_if_index == ~0) + return clib_error_return (0, "missing rx interface"); + if (hw_if_index == ~0) + return clib_error_return (0, "missing hw interface"); + + u32 t_index = vnet_vxlan_get_tunnel_index (rx_sw_if_index);; + if (t_index == ~0) + return clib_error_return (0, "%U is not a vxlan tunnel", + format_vnet_sw_if_index_name, vnm, + rx_sw_if_index); + + vxlan_main_t *vxm = &vxlan_main; + vxlan_tunnel_t *t = pool_elt_at_index (vxm->tunnels, t_index); + + if (!ip46_address_is_ip4 (&t->dst)) + return clib_error_return (0, "currently only IPV4 tunnels are supported"); + + vnet_hw_interface_t *hw_if = vnet_get_hw_interface (vnm, hw_if_index); + ip4_main_t *im = &ip4_main; + u32 rx_fib_index = + vec_elt (im->fib_index_by_sw_if_index, hw_if->sw_if_index); + + if (t->encap_fib_index != rx_fib_index) + return clib_error_return (0, "interface/tunnel fib mismatch"); + + if (vnet_vxlan_add_del_rx_flow (hw_if_index, t_index, is_add)) + return clib_error_return (0, "error %s flow", + is_add ? 
"enabling" : "disabling"); + + return 0; +} + +/* *INDENT-OFF* */ +VLIB_CLI_COMMAND (vxlan_offload_command, static) = { + .path = "set flow-offload vxlan", + .short_help = + "set flow-offload vxlan hw <interface-name> rx <tunnel-name> [del]", + .function = vxlan_offload_command_fn, +}; +/* *INDENT-ON* */ + +#define VXLAN_HASH_NUM_BUCKETS (2 * 1024) +#define VXLAN_HASH_MEMORY_SIZE (1 << 20) + +clib_error_t * +vxlan_init (vlib_main_t * vm) +{ + vxlan_main_t *vxm = &vxlan_main; + + vxm->vnet_main = vnet_get_main (); + vxm->vlib_main = vm; + + vnet_flow_get_range (vxm->vnet_main, "vxlan", 1024 * 1024, + &vxm->flow_id_start); + + vxm->bm_ip4_bypass_enabled_by_sw_if = 0; + vxm->bm_ip6_bypass_enabled_by_sw_if = 0; + + /* initialize the ip6 hash */ + clib_bihash_init_16_8 (&vxm->vxlan4_tunnel_by_key, "vxlan4", + VXLAN_HASH_NUM_BUCKETS, VXLAN_HASH_MEMORY_SIZE); + clib_bihash_init_24_8 (&vxm->vxlan6_tunnel_by_key, "vxlan6", + VXLAN_HASH_NUM_BUCKETS, VXLAN_HASH_MEMORY_SIZE); + vxm->vtep_table = vtep_table_create (); + vxm->mcast_shared = hash_create_mem (0, + sizeof (ip46_address_t), + sizeof (mcast_shared_t)); + + fib_node_register_type (FIB_NODE_TYPE_VXLAN_TUNNEL, &vxlan_vft); + + return 0; +} + +VLIB_INIT_FUNCTION (vxlan_init); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/vxlan/vxlan.h b/src/plugins/vxlan/vxlan.h new file mode 100644 index 00000000000..72d82e79cf5 --- /dev/null +++ b/src/plugins/vxlan/vxlan.h @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef included_vnet_vxlan_h +#define included_vnet_vxlan_h + +#include <vppinfra/error.h> +#include <vppinfra/hash.h> +#include <vppinfra/bihash_16_8.h> +#include <vppinfra/bihash_24_8.h> +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ip/vtep.h> +#include <vnet/l2/l2_input.h> +#include <vnet/l2/l2_output.h> +#include <vnet/l2/l2_bd.h> +#include <vnet/ethernet/ethernet.h> +#include <vxlan/vxlan_packet.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> +#include <vnet/udp/udp_packet.h> +#include <vnet/dpo/dpo.h> +#include <vnet/adj/adj_types.h> + +/* *INDENT-OFF* */ +typedef CLIB_PACKED (struct { + ip4_header_t ip4; /* 20 bytes */ + udp_header_t udp; /* 8 bytes */ + vxlan_header_t vxlan; /* 8 bytes */ +}) ip4_vxlan_header_t; + +typedef CLIB_PACKED (struct { + ip6_header_t ip6; /* 40 bytes */ + udp_header_t udp; /* 8 bytes */ + vxlan_header_t vxlan; /* 8 bytes */ +}) ip6_vxlan_header_t; +/* *INDENT-ON* */ + +/* +* Key fields: remote ip, vni on incoming VXLAN packet +* all fields in NET byte order +*/ +typedef clib_bihash_kv_16_8_t vxlan4_tunnel_key_t; + +/* +* Key fields: remote ip, vni and fib index on incoming VXLAN packet +* ip, vni fields in NET byte order +* fib index field in host byte order +*/ +typedef clib_bihash_kv_24_8_t vxlan6_tunnel_key_t; + +typedef union +{ + struct + { + u32 sw_if_index; /* unicast - input interface / mcast - stats interface */ + union + { + struct /* unicast action */ + { + u16 next_index; + u8 error; + }; + ip4_address_t local_ip; /* used as dst ip for mcast pkts to assign 
them to unicast tunnel */ + }; + }; + u64 as_u64; +} vxlan_decap_info_t; + +typedef struct +{ + /* Required for pool_get_aligned */ + CLIB_CACHE_LINE_ALIGN_MARK (cacheline0); + + /* FIB DPO for IP forwarding of VXLAN encap packet */ + dpo_id_t next_dpo; + + /* vxlan VNI in HOST byte order */ + u32 vni; + + /* tunnel src and dst addresses */ + ip46_address_t src; + ip46_address_t dst; + + /* udp-ports */ + u16 src_port; + u16 dst_port; + + /* mcast packet output intfc index (used only if dst is mcast) */ + u32 mcast_sw_if_index; + + /* decap next index */ + u16 decap_next_index; + + /* The FIB index for src/dst addresses */ + u32 encap_fib_index; + + /* vnet intfc index */ + u32 sw_if_index; + u32 hw_if_index; + + /** + * Linkage into the FIB object graph + */ + fib_node_t node; + + /* + * The FIB entry for (depending on VXLAN tunnel is unicast or mcast) + * sending unicast VXLAN encap packets or receiving mcast VXLAN packets + */ + fib_node_index_t fib_entry_index; + adj_index_t mcast_adj_index; + + /** + * The tunnel is a child of the FIB entry for its destination. This is + * so it receives updates when the forwarding information for that entry + * changes. + * The tunnels sibling index on the FIB entry's dependency list. 
+ */ + u32 sibling_index; + + u32 flow_index; /* infra flow index */ + u32 dev_instance; /* Real device instance in tunnel vector */ + u32 user_instance; /* Instance name being shown to user */ + + VNET_DECLARE_REWRITE; +} vxlan_tunnel_t; + +#define foreach_vxlan_input_next \ +_(DROP, "error-drop") \ +_(L2_INPUT, "l2-input") + +typedef enum +{ +#define _(s,n) VXLAN_INPUT_NEXT_##s, + foreach_vxlan_input_next +#undef _ + VXLAN_INPUT_N_NEXT, +} vxlan_input_next_t; + +typedef enum +{ +#define vxlan_error(n,s) VXLAN_ERROR_##n, +#include <vxlan/vxlan_error.def> +#undef vxlan_error + VXLAN_N_ERROR, +} vxlan_input_error_t; + +typedef struct +{ + /* vector of encap tunnel instances */ + vxlan_tunnel_t *tunnels; + + /* lookup tunnel by key */ + clib_bihash_16_8_t + vxlan4_tunnel_by_key; /* keyed on ipv4.dst + src_port + fib + vni */ + clib_bihash_24_8_t + vxlan6_tunnel_by_key; /* keyed on ipv6.dst + src_port + fib + vni */ + + /* local VTEP IPs ref count used by vxlan-bypass node to check if + received VXLAN packet DIP matches any local VTEP address */ + vtep_table_t vtep_table; + + /* mcast shared info */ + uword *mcast_shared; /* keyed on mcast ip46 addr */ + + /* Mapping from sw_if_index to tunnel index */ + u32 *tunnel_index_by_sw_if_index; + + /* graph node state */ + uword *bm_ip4_bypass_enabled_by_sw_if; + uword *bm_ip6_bypass_enabled_by_sw_if; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + + /* Record used instances */ + uword *instance_used; + u32 flow_id_start; + + /* cache for last 8 vxlan tunnel */ + vtep4_cache_t vtep4_u512; + +} vxlan_main_t; + +extern vxlan_main_t vxlan_main; + +extern vlib_node_registration_t vxlan4_input_node; +extern vlib_node_registration_t vxlan6_input_node; +extern vlib_node_registration_t vxlan4_encap_node; +extern vlib_node_registration_t vxlan6_encap_node; +extern vlib_node_registration_t vxlan4_flow_input_node; + +u8 *format_vxlan_encap_trace (u8 * s, va_list * args); + +typedef struct +{ + u8 is_add; + + 
/* we normally use is_ip4, but since this adds to the + * structure, this seems less of a breaking change */ + u8 is_ip6; + u8 is_l3; + u32 instance; + ip46_address_t src, dst; + u32 mcast_sw_if_index; + u32 encap_fib_index; + u32 decap_next_index; + u32 vni; + u16 src_port; + u16 dst_port; +} vnet_vxlan_add_del_tunnel_args_t; + +int vnet_vxlan_add_del_tunnel + (vnet_vxlan_add_del_tunnel_args_t * a, u32 * sw_if_indexp); + +void vnet_int_vxlan_bypass_mode (u32 sw_if_index, u8 is_ip6, u8 is_enable); + +int vnet_vxlan_add_del_rx_flow (u32 hw_if_index, u32 t_imdex, int is_add); + +u32 vnet_vxlan_get_tunnel_index (u32 sw_if_index); +#endif /* included_vnet_vxlan_h */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ diff --git a/src/plugins/vxlan/vxlan_api.c b/src/plugins/vxlan/vxlan_api.c new file mode 100644 index 00000000000..8fd0928cc63 --- /dev/null +++ b/src/plugins/vxlan/vxlan_api.c @@ -0,0 +1,376 @@ +/* + *------------------------------------------------------------------ + * vxlan_api.c - vxlan api + * + * Copyright (c) 2016 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *------------------------------------------------------------------ + */ + +#include <vnet/vnet.h> +#include <vlibmemory/api.h> + +#include <vnet/interface.h> +#include <vnet/api_errno.h> +#include <vnet/feature/feature.h> +#include <vxlan/vxlan.h> +#include <vnet/fib/fib_table.h> +#include <vnet/ip/ip_types_api.h> +#include <vnet/udp/udp_local.h> +#include <vnet/format_fns.h> +#include <vxlan/vxlan.api_enum.h> +#include <vxlan/vxlan.api_types.h> + +static u16 msg_id_base; + +#define REPLY_MSG_ID_BASE msg_id_base +#include <vlibapi/api_helper_macros.h> + +static void +vl_api_vxlan_offload_rx_t_handler (vl_api_vxlan_offload_rx_t * mp) +{ + vl_api_vxlan_offload_rx_reply_t *rmp; + int rv = 0; + u32 hw_if_index = ntohl (mp->hw_if_index); + u32 sw_if_index = ntohl (mp->sw_if_index); + + if (!vnet_hw_interface_is_valid (vnet_get_main (), hw_if_index)) + { + rv = VNET_API_ERROR_NO_SUCH_ENTRY; + goto err; + } + VALIDATE_SW_IF_INDEX (mp); + + u32 t_index = vnet_vxlan_get_tunnel_index (sw_if_index); + if (t_index == ~0) + { + rv = VNET_API_ERROR_INVALID_SW_IF_INDEX_2; + goto err; + } + + vxlan_main_t *vxm = &vxlan_main; + vxlan_tunnel_t *t = pool_elt_at_index (vxm->tunnels, t_index); + if (!ip46_address_is_ip4 (&t->dst)) + { + rv = VNET_API_ERROR_INVALID_ADDRESS_FAMILY; + goto err; + } + + vnet_main_t *vnm = vnet_get_main (); + vnet_hw_interface_t *hw_if = vnet_get_hw_interface (vnm, hw_if_index); + ip4_main_t *im = &ip4_main; + u32 rx_fib_index = + vec_elt (im->fib_index_by_sw_if_index, hw_if->sw_if_index); + + if (t->encap_fib_index != rx_fib_index) + { + rv = VNET_API_ERROR_NO_SUCH_FIB; + goto err; + } + + if (vnet_vxlan_add_del_rx_flow (hw_if_index, t_index, mp->enable)) + { + rv = VNET_API_ERROR_UNSPECIFIED; + goto err; + } + BAD_SW_IF_INDEX_LABEL; +err: + + REPLY_MACRO (VL_API_VXLAN_OFFLOAD_RX_REPLY); +} + +static void + vl_api_sw_interface_set_vxlan_bypass_t_handler + (vl_api_sw_interface_set_vxlan_bypass_t * mp) +{ + vl_api_sw_interface_set_vxlan_bypass_reply_t 
*rmp; + int rv = 0; + u32 sw_if_index = ntohl (mp->sw_if_index); + + VALIDATE_SW_IF_INDEX (mp); + + vnet_int_vxlan_bypass_mode (sw_if_index, mp->is_ipv6, mp->enable); + BAD_SW_IF_INDEX_LABEL; + + REPLY_MACRO (VL_API_SW_INTERFACE_SET_VXLAN_BYPASS_REPLY); +} + +static int +vxlan_add_del_tunnel_clean_input (vnet_vxlan_add_del_tunnel_args_t *a, + u32 encap_vrf_id) +{ + a->is_ip6 = !ip46_address_is_ip4 (&a->src); + + a->encap_fib_index = fib_table_find (fib_ip_proto (a->is_ip6), encap_vrf_id); + if (a->encap_fib_index == ~0) + { + return VNET_API_ERROR_NO_SUCH_FIB; + } + + if (ip46_address_is_ip4 (&a->src) != ip46_address_is_ip4 (&a->dst)) + { + return VNET_API_ERROR_INVALID_VALUE; + } + + /* Check src & dst are different */ + if (ip46_address_cmp (&a->dst, &a->src) == 0) + { + return VNET_API_ERROR_SAME_SRC_DST; + } + if (ip46_address_is_multicast (&a->dst) && + !vnet_sw_if_index_is_api_valid (a->mcast_sw_if_index)) + { + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + } + return 0; +} + +static void +vl_api_vxlan_add_del_tunnel_t_handler (vl_api_vxlan_add_del_tunnel_t *mp) +{ + vl_api_vxlan_add_del_tunnel_reply_t *rmp; + u32 sw_if_index = ~0; + int rv = 0; + + vnet_vxlan_add_del_tunnel_args_t a = { + .is_add = mp->is_add, + .instance = ntohl (mp->instance), + .mcast_sw_if_index = ntohl (mp->mcast_sw_if_index), + .decap_next_index = ntohl (mp->decap_next_index), + .vni = ntohl (mp->vni), + }; + ip_address_decode (&mp->src_address, &a.src); + ip_address_decode (&mp->dst_address, &a.dst); + + rv = vxlan_add_del_tunnel_clean_input (&a, ntohl (mp->encap_vrf_id)); + if (rv) + goto out; + a.dst_port = a.is_ip6 ? UDP_DST_PORT_vxlan6 : UDP_DST_PORT_vxlan, + a.src_port = a.is_ip6 ? 
UDP_DST_PORT_vxlan6 : UDP_DST_PORT_vxlan, + rv = vnet_vxlan_add_del_tunnel (&a, &sw_if_index); + +out: + REPLY_MACRO2(VL_API_VXLAN_ADD_DEL_TUNNEL_REPLY, + ({ + rmp->sw_if_index = ntohl (sw_if_index); + })); +} + +static void +vl_api_vxlan_add_del_tunnel_v2_t_handler (vl_api_vxlan_add_del_tunnel_v2_t *mp) +{ + vl_api_vxlan_add_del_tunnel_v2_reply_t *rmp; + u32 sw_if_index = ~0; + int rv = 0; + + vnet_vxlan_add_del_tunnel_args_t a = { + .is_add = mp->is_add, + .instance = ntohl (mp->instance), + .mcast_sw_if_index = ntohl (mp->mcast_sw_if_index), + .decap_next_index = ntohl (mp->decap_next_index), + .vni = ntohl (mp->vni), + .dst_port = ntohs (mp->dst_port), + .src_port = ntohs (mp->src_port), + }; + + ip_address_decode (&mp->src_address, &a.src); + ip_address_decode (&mp->dst_address, &a.dst); + + rv = vxlan_add_del_tunnel_clean_input (&a, ntohl (mp->encap_vrf_id)); + if (rv) + goto out; + rv = vnet_vxlan_add_del_tunnel (&a, &sw_if_index); +out: + REPLY_MACRO2 (VL_API_VXLAN_ADD_DEL_TUNNEL_V2_REPLY, + ({ rmp->sw_if_index = ntohl (sw_if_index); })); +} + +static void +vl_api_vxlan_add_del_tunnel_v3_t_handler (vl_api_vxlan_add_del_tunnel_v3_t *mp) +{ + vl_api_vxlan_add_del_tunnel_v3_reply_t *rmp; + u32 sw_if_index = ~0; + int rv = 0; + + vnet_vxlan_add_del_tunnel_args_t a = { + .is_add = mp->is_add, + .instance = ntohl (mp->instance), + .mcast_sw_if_index = ntohl (mp->mcast_sw_if_index), + .decap_next_index = ntohl (mp->decap_next_index), + .vni = ntohl (mp->vni), + .dst_port = ntohs (mp->dst_port), + .src_port = ntohs (mp->src_port), + .is_l3 = mp->is_l3, + }; + + ip_address_decode (&mp->src_address, &a.src); + ip_address_decode (&mp->dst_address, &a.dst); + + rv = vxlan_add_del_tunnel_clean_input (&a, ntohl (mp->encap_vrf_id)); + if (rv) + goto out; + rv = vnet_vxlan_add_del_tunnel (&a, &sw_if_index); +out: + REPLY_MACRO2 (VL_API_VXLAN_ADD_DEL_TUNNEL_V3_REPLY, + ({ rmp->sw_if_index = ntohl (sw_if_index); })); +} + +static void send_vxlan_tunnel_details + 
(vxlan_tunnel_t * t, vl_api_registration_t * reg, u32 context) +{ + vl_api_vxlan_tunnel_details_t *rmp; + ip4_main_t *im4 = &ip4_main; + ip6_main_t *im6 = &ip6_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + clib_memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs (REPLY_MSG_ID_BASE + VL_API_VXLAN_TUNNEL_DETAILS); + + ip_address_encode (&t->src, IP46_TYPE_ANY, &rmp->src_address); + ip_address_encode (&t->dst, IP46_TYPE_ANY, &rmp->dst_address); + + if (ip46_address_is_ip4 (&t->dst)) + rmp->encap_vrf_id = htonl (im4->fibs[t->encap_fib_index].ft_table_id); + else + rmp->encap_vrf_id = htonl (im6->fibs[t->encap_fib_index].ft_table_id); + + rmp->instance = htonl (t->user_instance); + rmp->mcast_sw_if_index = htonl (t->mcast_sw_if_index); + rmp->vni = htonl (t->vni); + rmp->decap_next_index = htonl (t->decap_next_index); + rmp->sw_if_index = htonl (t->sw_if_index); + rmp->context = context; + + vl_api_send_msg (reg, (u8 *) rmp); +} + +static void vl_api_vxlan_tunnel_dump_t_handler + (vl_api_vxlan_tunnel_dump_t * mp) +{ + vl_api_registration_t *reg; + vxlan_main_t *vxm = &vxlan_main; + vxlan_tunnel_t *t; + u32 sw_if_index; + + reg = vl_api_client_index_to_registration (mp->client_index); + if (!reg) + return; + + sw_if_index = ntohl (mp->sw_if_index); + + if (~0 == sw_if_index) + { + pool_foreach (t, vxm->tunnels) + send_vxlan_tunnel_details(t, reg, mp->context); + } + else + { + if ((sw_if_index >= vec_len (vxm->tunnel_index_by_sw_if_index)) || + (~0 == vxm->tunnel_index_by_sw_if_index[sw_if_index])) + { + return; + } + t = &vxm->tunnels[vxm->tunnel_index_by_sw_if_index[sw_if_index]]; + send_vxlan_tunnel_details (t, reg, mp->context); + } +} + +static void +send_vxlan_tunnel_v2_details (vxlan_tunnel_t *t, vl_api_registration_t *reg, + u32 context) +{ + vl_api_vxlan_tunnel_v2_details_t *rmp; + ip4_main_t *im4 = &ip4_main; + ip6_main_t *im6 = &ip6_main; + + rmp = vl_msg_api_alloc (sizeof (*rmp)); + clib_memset (rmp, 0, sizeof (*rmp)); + rmp->_vl_msg_id = ntohs 
(REPLY_MSG_ID_BASE + VL_API_VXLAN_TUNNEL_V2_DETAILS); + + ip_address_encode (&t->src, IP46_TYPE_ANY, &rmp->src_address); + ip_address_encode (&t->dst, IP46_TYPE_ANY, &rmp->dst_address); + rmp->src_port = htons (t->src_port); + rmp->dst_port = htons (t->dst_port); + + if (ip46_address_is_ip4 (&t->dst)) + rmp->encap_vrf_id = htonl (im4->fibs[t->encap_fib_index].ft_table_id); + else + rmp->encap_vrf_id = htonl (im6->fibs[t->encap_fib_index].ft_table_id); + + rmp->instance = htonl (t->user_instance); + rmp->mcast_sw_if_index = htonl (t->mcast_sw_if_index); + rmp->vni = htonl (t->vni); + rmp->decap_next_index = htonl (t->decap_next_index); + rmp->sw_if_index = htonl (t->sw_if_index); + rmp->context = context; + + vl_api_send_msg (reg, (u8 *) rmp); +} + +static void +vl_api_vxlan_tunnel_v2_dump_t_handler (vl_api_vxlan_tunnel_v2_dump_t *mp) +{ + vl_api_registration_t *reg; + vxlan_main_t *vxm = &vxlan_main; + vxlan_tunnel_t *t; + u32 sw_if_index; + + reg = vl_api_client_index_to_registration (mp->client_index); + if (!reg) + return; + + sw_if_index = ntohl (mp->sw_if_index); + + if (~0 == sw_if_index) + { + pool_foreach (t, vxm->tunnels) + send_vxlan_tunnel_v2_details (t, reg, mp->context); + } + else + { + if ((sw_if_index >= vec_len (vxm->tunnel_index_by_sw_if_index)) || + (~0 == vxm->tunnel_index_by_sw_if_index[sw_if_index])) + { + return; + } + t = &vxm->tunnels[vxm->tunnel_index_by_sw_if_index[sw_if_index]]; + send_vxlan_tunnel_v2_details (t, reg, mp->context); + } +} + +#include <vxlan/vxlan.api.c> +static clib_error_t * +vxlan_api_hookup (vlib_main_t * vm) +{ + api_main_t *am = vlibapi_get_main (); + + vl_api_increase_msg_trace_size (am, VL_API_VXLAN_ADD_DEL_TUNNEL, + 16 * sizeof (u32)); + + /* + * Set up the (msg_name, crc, message-id) table + */ + msg_id_base = setup_message_id_table (); + + return 0; +} + +VLIB_API_INIT_FUNCTION (vxlan_api_hookup); + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: 
+ */ diff --git a/src/plugins/vxlan/vxlan_error.def b/src/plugins/vxlan/vxlan_error.def new file mode 100644 index 00000000000..17f905950f5 --- /dev/null +++ b/src/plugins/vxlan/vxlan_error.def @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +vxlan_error (DECAPSULATED, "good packets decapsulated") +vxlan_error (NO_SUCH_TUNNEL, "no such tunnel packets") +vxlan_error (BAD_FLAGS, "packets with bad flags field in vxlan header") diff --git a/src/plugins/vxlan/vxlan_packet.h b/src/plugins/vxlan/vxlan_packet.h new file mode 100644 index 00000000000..d1d1ed813e5 --- /dev/null +++ b/src/plugins/vxlan/vxlan_packet.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_vxlan_packet_h__ +#define __included_vxlan_packet_h__ 1 + +/* + * From RFC-7348 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |R|R|R|R|I|R|R|R| Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * VXLAN Header: This is an 8-byte field that has: + * + * - Flags (8 bits): where the I flag MUST be set to 1 for a valid + * VXLAN Network ID (VNI). The other 7 bits (designated "R") are + * reserved fields and MUST be set to zero on transmission and + * ignored on receipt. + * + * - VXLAN Segment ID/VXLAN Network Identifier (VNI): this is a + * 24-bit value used to designate the individual VXLAN overlay + * network on which the communicating VMs are situated. VMs in + * different VXLAN overlay networks cannot communicate with each + * other. + * + * - Reserved fields (24 bits and 8 bits): MUST be set to zero on + * transmission and ignored on receipt. + * + */ + +typedef struct +{ + u8 flags; + u8 res1; + u8 res2; + u8 res3; + u32 vni_reserved; +} vxlan_header_t; + +#define VXLAN_FLAGS_I 0x08 + +static inline u32 +vnet_get_vni (vxlan_header_t * h) +{ + u32 vni_reserved_host_byte_order; + + vni_reserved_host_byte_order = clib_net_to_host_u32 (h->vni_reserved); + return vni_reserved_host_byte_order >> 8; +} + +static inline void +vnet_set_vni_and_flags (vxlan_header_t * h, u32 vni) +{ + h->vni_reserved = clib_host_to_net_u32 (vni << 8); + *(u32 *) h = 0; + h->flags = VXLAN_FLAGS_I; +} + +#endif /* __included_vxlan_packet_h__ */ + +/* + * fd.io coding-style-patch-verification: ON + * + * Local Variables: + * eval: (c-set-style "gnu") + * End: + */ |