Diffstat (limited to 'src/plugins/pvti')
-rw-r--r-- | src/plugins/pvti/CMakeLists.txt | 40
-rw-r--r-- | src/plugins/pvti/FEATURE.yaml | 8
-rw-r--r-- | src/plugins/pvti/api.c | 137
-rw-r--r-- | src/plugins/pvti/bypass-main.c | 79
-rw-r--r-- | src/plugins/pvti/bypass.c | 202
-rw-r--r-- | src/plugins/pvti/bypass.h | 53
-rw-r--r-- | src/plugins/pvti/input-main.c | 115
-rw-r--r-- | src/plugins/pvti/input.c | 496
-rw-r--r-- | src/plugins/pvti/input.h | 87
-rw-r--r-- | src/plugins/pvti/output-main.c | 85
-rw-r--r-- | src/plugins/pvti/output.c | 543
-rw-r--r-- | src/plugins/pvti/output.h | 75
-rw-r--r-- | src/plugins/pvti/pvti.api | 111
-rw-r--r-- | src/plugins/pvti/pvti.c | 481
-rw-r--r-- | src/plugins/pvti/pvti.h | 257
-rw-r--r-- | src/plugins/pvti/pvti_if.c | 376
-rw-r--r-- | src/plugins/pvti/pvti_if.h | 47
17 files changed, 3192 insertions, 0 deletions
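For orientation before the per-file diff: the transmit path in output.c below aggregates inner packets into one underlay packet per peer, sized by underlay_mtu minus the encapsulation overhead, and splits anything that does not fit into chunks that the receiver reassembles (input.c). The following standalone sketch only illustrates that budget arithmetic; all sizes in it are assumptions for illustration, not the plugin's values — the authoritative numbers are sizeof (pvti_ip4_encap_header_t) / sizeof (pvti_ip6_encap_header_t), sizeof (pvti_packet_header_t), PVTI_ALIGN_BYTES and sizeof (pvti_chunk_header_t) from pvti.h.

/*
 * Standalone sketch (not part of the plugin) of the TX budget math in
 * pvti_find_or_try_create_tx_peer () / pvti_output_node_common () below.
 * All sizes here are illustrative assumptions; the plugin derives them
 * from the structures in pvti.h.
 */
#include <stdio.h>
#include <stdbool.h>

#define ASSUMED_IP4_UDP_ENCAP 28 /* 20 B IPv4 + 8 B UDP */
#define ASSUMED_IP6_UDP_ENCAP 48 /* 40 B IPv6 + 8 B UDP */
#define ASSUMED_PVTI_HDR      16 /* pvti_packet_header_t incl. pad */
#define ASSUMED_CHUNK_HDR      4 /* pvti_chunk_header_t */

/* Payload bytes available for chunks in one underlay packet. */
static unsigned
tx_budget (unsigned underlay_mtu, bool is_ip6)
{
  unsigned encap = is_ip6 ? ASSUMED_IP6_UDP_ENCAP : ASSUMED_IP4_UDP_ENCAP;
  return underlay_mtu - encap - ASSUMED_PVTI_HDR;
}

int
main (void)
{
  unsigned budget = tx_budget (1500, false);
  unsigned inner = 8000; /* large inner packet, per FEATURE.yaml */
  unsigned per_pkt = budget - ASSUMED_CHUNK_HDR;
  /* One chunk of this inner packet per underlay packet; the trailing
   * chunks are the "reassembly" chunks counted in reass_chunk_count.
   * (The real node also packs other packets' chunks into the same
   * underlay buffer, which this sketch ignores.) */
  unsigned pkts = (inner + per_pkt - 1) / per_pkt;
  printf ("per-packet budget %u B -> %u underlay packets for %u B inner\n",
          budget, pkts, inner);
  return 0;
}

With these assumed sizes a 1500-byte IPv4 underlay leaves roughly a 1456-byte chunk budget per packet, so an 8 kB inner packet travels in about six underlay packets — the "~8K inner MTU over a 1280..1500 MTU substrate" goal stated in FEATURE.yaml.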
diff --git a/src/plugins/pvti/CMakeLists.txt b/src/plugins/pvti/CMakeLists.txt new file mode 100644 index 00000000000..900b662d54a --- /dev/null +++ b/src/plugins/pvti/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright (c) 2024 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_vpp_plugin(pvti + SOURCES + pvti_if.c + pvti.c + input.h + input.c + input-main.c + output.h + output.c + output-main.c + bypass.h + bypass.c + bypass-main.c + api.c + pvti.h + + MULTIARCH_SOURCES + input.c + output.c + bypass.c + + API_FILES + pvti.api + + # API_TEST_SOURCES + # pvti_test.c +) diff --git a/src/plugins/pvti/FEATURE.yaml b/src/plugins/pvti/FEATURE.yaml new file mode 100644 index 00000000000..52dbe5b7c1b --- /dev/null +++ b/src/plugins/pvti/FEATURE.yaml @@ -0,0 +1,8 @@ +--- +name: Packet Vector Tunnel +maintainer: Andrew Yourtchenko <ayourtch@gmail.com> +features: + - support inner MTU up to ~8K over standard 1280..1500 MTU substrate +description: "Large MTU Tunnels" +state: development +properties: [API, CLI] diff --git a/src/plugins/pvti/api.c b/src/plugins/pvti/api.c new file mode 100644 index 00000000000..cda39ad44e8 --- /dev/null +++ b/src/plugins/pvti/api.c @@ -0,0 +1,137 @@ + +#include <vnet/vnet.h> +#include <vlibmemory/api.h> + +#include <vnet/format_fns.h> +#include <vnet/ip/ip_types_api.h> +#include <vlibapi/api.h> + +#include <pvti/pvti.api_enum.h> +#include <pvti/pvti.api_types.h> + +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +#define REPLY_MSG_ID_BASE pvm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +typedef struct +{ + vl_api_registration_t *reg; + u32 context; +} pvti_if_details_ctx_t; + +typedef struct +{ + +} pvti_interface_dump_ctx_t; + +static walk_rc_t +pvti_if_send_details (index_t pvtii, void *data) +{ + vl_api_pvti_interface_details_t *rmp; + pvti_if_details_ctx_t *ctx = data; + const pvti_if_t *pvi; + + pvi = pvti_if_get (pvtii); + + rmp = vl_msg_api_alloc_zero (sizeof (*rmp)); + rmp->_vl_msg_id = + htons (VL_API_PVTI_INTERFACE_DETAILS + pvti_main.msg_id_base); + + rmp->interface.sw_if_index = htonl (pvi->sw_if_index); + rmp->interface.local_port = htons (pvi->local_port); + rmp->interface.remote_port = htons (pvi->remote_port); + rmp->interface.underlay_mtu = htons (pvi->underlay_mtu); + + ip_address_encode2 (&pvi->local_ip, &rmp->interface.local_ip); + ip_address_encode2 (&pvi->remote_ip, &rmp->interface.remote_ip); + + rmp->context = ctx->context; + + vl_api_send_msg (ctx->reg, (u8 *) rmp); + + return (WALK_CONTINUE); +} + +static void +vl_api_pvti_interface_dump_t_handler (vl_api_pvti_interface_dump_t *mp) +{ + vl_api_registration_t *reg; + // pvti_main_t *pvm = &pvti_main; + + reg = vl_api_client_index_to_registration (mp->client_index); + if (reg == 0) + return; + + pvti_if_details_ctx_t ctx = { + .reg = reg, + .context = mp->context, + }; + + u32 sw_if_index = ntohl (mp->sw_if_index); + if (sw_if_index == ~0) + pvti_if_walk (pvti_if_send_details, &ctx); + else + { + index_t pvtii = pvti_if_find_by_sw_if_index (sw_if_index); + if 
(pvtii != INDEX_INVALID) + pvti_if_send_details (pvtii, &ctx); + } +} + +static void +vl_api_pvti_interface_create_t_handler (vl_api_pvti_interface_create_t *mp) +{ + vl_api_pvti_interface_create_reply_t *rmp; + pvti_main_t *pvm = &pvti_main; + int rv = ~0; + u32 sw_if_index = ~0; + ip_address_t local_ip; + ip_address_t remote_ip; + + ip_address_decode2 (&mp->interface.local_ip, &local_ip); + ip_address_decode2 (&mp->interface.remote_ip, &remote_ip); + u16 lport = clib_host_to_net_u16 (mp->interface.local_port); + u16 rport = clib_host_to_net_u16 (mp->interface.remote_port); + u16 underlay_mtu = clib_host_to_net_u16 (mp->interface.underlay_mtu); + u32 underlay_fib_index = + clib_host_to_net_u32 (mp->interface.underlay_fib_index); + pvti_peer_address_method_t peer_address_method = + mp->interface.peer_address_from_payload ? PVTI_PEER_ADDRESS_FROM_PAYLOAD : + PVTI_PEER_ADDRESS_FIXED; + + if (underlay_mtu == 0) + { + underlay_mtu = 1500; + } + + rv = + pvti_if_create (&local_ip, lport, &remote_ip, rport, peer_address_method, + underlay_mtu, underlay_fib_index, &sw_if_index); + + REPLY_MACRO2 (VL_API_PVTI_INTERFACE_CREATE_REPLY, + { rmp->sw_if_index = htonl (sw_if_index); }); +} + +static void +vl_api_pvti_interface_delete_t_handler (vl_api_pvti_interface_delete_t *mp) +{ + vl_api_pvti_interface_delete_reply_t *rmp; + pvti_main_t *pvm = &pvti_main; + int rv = 0; + + rv = pvti_if_delete (ntohl (mp->sw_if_index)); + REPLY_MACRO (VL_API_PVTI_INTERFACE_DELETE_REPLY); +} + +/* API definitions */ +#include <pvti/pvti.api.c> + +void +pvti_api_init () +{ + pvti_main_t *pvm = &pvti_main; + /* Add our API messages to the global name_crc hash table */ + pvm->msg_id_base = setup_message_id_table (); +} diff --git a/src/plugins/pvti/bypass-main.c b/src/plugins/pvti/bypass-main.c new file mode 100644 index 00000000000..db79ccd2113 --- /dev/null +++ b/src/plugins/pvti/bypass-main.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <pvti/bypass.h> + +/* packet trace format function */ +static u8 * +format_pvti_bypass_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + pvti_bypass_trace_t *t = va_arg (*args, pvti_bypass_trace_t *); + + s = format (s, "PVTI-BYPASS: sw_if_index %d, next index %d\n", + t->sw_if_index, t->next_index); + s = format (s, " src %U sport %d dport %d\n", format_ip_address, + &t->remote_ip, t->remote_port, t->local_port); + s = format (s, " seq: %d", t->seq); + return s; +} + +vlib_node_registration_t pvti4_bypass_node; +vlib_node_registration_t pvti6_bypass_node; + +static char *pvti_bypass_error_strings[] = { +#define _(sym, string) string, + foreach_pvti_bypass_error +#undef _ +}; + +VLIB_REGISTER_NODE (pvti4_bypass_node) = +{ + .name = "ip4-pvti-bypass", + .vector_size = sizeof (u32), + .format_trace = format_pvti_bypass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_bypass_error_strings), + .error_strings = pvti_bypass_error_strings, + + .n_next_nodes = PVTI_BYPASS_N_NEXT, + + .next_nodes = { + [PVTI_BYPASS_NEXT_DROP] = "error-drop", + [PVTI_BYPASS_NEXT_PVTI_INPUT] = "pvti4-input", + }, + +}; + +VLIB_REGISTER_NODE (pvti6_bypass_node) = +{ + .name = "ip6-pvti-bypass", + .vector_size = sizeof (u32), + .format_trace = format_pvti_bypass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_bypass_error_strings), + .error_strings = pvti_bypass_error_strings, + + .n_next_nodes = PVTI_BYPASS_N_NEXT, + + .next_nodes = { + [PVTI_BYPASS_NEXT_DROP] = "error-drop", + [PVTI_BYPASS_NEXT_PVTI_INPUT] = "pvti6-input", + }, + +}; diff --git a/src/plugins/pvti/bypass.c b/src/plugins/pvti/bypass.c new file mode 100644 index 00000000000..14c976439eb --- /dev/null +++ b/src/plugins/pvti/bypass.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> +#include <pvti/bypass.h> + +always_inline u16 +pvti_bypass_node_common (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, bool is_ip6) +{ + u32 n_left_from, *from, *to_next; + pvti_bypass_next_t next_index; + vlib_node_runtime_t *error_node = + vlib_node_get_runtime (vm, ip4_input_node.index); + + u32 pkts_processed = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 sw_if_index0 = 0; + ip4_header_t *ip40; + ip6_header_t *ip60; + udp_header_t *udp0; + u32 bi0, ip_len0, udp_len0, flags0, next0; + u8 error0, good_udp0, proto0; + i32 len_diff0; + + bi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* setup the packet for the next feature */ + vnet_feature_next (&next0, b0); + + if (is_ip6) + { + ip60 = vlib_buffer_get_current (b0); + } + else + { + ip40 = vlib_buffer_get_current (b0); + } + + if (is_ip6) + { + proto0 = ip60->protocol; + } + else + { + /* Treat IP frag packets as "experimental" protocol for now */ + proto0 = ip4_is_fragment (ip40) ? 0xfe : ip40->protocol; + } + + /* Process packet 0 */ + if (proto0 != IP_PROTOCOL_UDP) + goto exit; /* not UDP packet */ + + if (is_ip6) + udp0 = ip6_next_header (ip60); + else + udp0 = ip4_next_header (ip40); + + /* look up the destination ip and port */ + u32 pvti_index0 = INDEX_INVALID; + if (is_ip6) + { + pvti_index0 = pvti_if_find_by_remote_ip6_and_port ( + &ip60->src_address, clib_net_to_host_u16 (udp0->src_port)); + } + else + { + pvti_index0 = pvti_if_find_by_remote_ip4_and_port ( + &ip40->src_address, clib_net_to_host_u16 (udp0->src_port)); + } + if (pvti_index0 == INDEX_INVALID) + goto exit; + + flags0 = b0->flags; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. + */ + good_udp0 |= udp0->checksum == 0; + + /* Verify UDP length */ + if (is_ip6) + ip_len0 = clib_net_to_host_u16 (ip60->payload_length); + else + ip_len0 = clib_net_to_host_u16 (ip40->length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp0)) + { + if (is_ip6) + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); + else + flags0 = ip4_tcp_udp_validate_checksum (vm, b0); + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + + if (is_ip6) + { + error0 = good_udp0 ? 0 : IP6_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP6_ERROR_UDP_LENGTH; + } + else + { + error0 = good_udp0 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP4_ERROR_UDP_LENGTH; + } + + next0 = error0 ? PVTI_BYPASS_NEXT_DROP : PVTI_BYPASS_NEXT_PVTI_INPUT; + b0->error = error0 ? 
error_node->errors[error0] : 0; + + /* pvtiX-input node expect current at PVTI header */ + if (is_ip6) + vlib_buffer_advance (b0, sizeof (ip6_header_t) + + sizeof (udp_header_t)); + else + vlib_buffer_advance (b0, sizeof (ip4_header_t) + + sizeof (udp_header_t)); + exit: + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_bypass_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->seq = 0; // clib_net_to_host_u32 (pvti0->seq); + if (is_ip6) + { + } + else + { + t->remote_ip.ip.ip4 = ip40->src_address; + t->remote_ip.version = AF_IP4; + } + // t->local_port = h0->udp.dst_port; + // t->remote_port = h0->udp.src_port; + } + + pkts_processed += 1; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, node->node_index, + PVTI_BYPASS_ERROR_PROCESSED, pkts_processed); + return frame->n_vectors; +} + +VLIB_NODE_FN (pvti4_bypass_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_bypass_node_common (vm, node, frame, 0); +} + +VLIB_NODE_FN (pvti6_bypass_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_bypass_node_common (vm, node, frame, 1); +} diff --git a/src/plugins/pvti/bypass.h b/src/plugins/pvti/bypass.h new file mode 100644 index 00000000000..611d5770ad3 --- /dev/null +++ b/src/plugins/pvti/bypass.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_pvti_bypass_h__ +#define __included_pvti_bypass_h__ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +typedef struct +{ + u32 next_index; + u32 sw_if_index; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u32 seq; +} pvti_bypass_trace_t; + +#define foreach_pvti_bypass_error \ + _ (PROCESSED, "PVTI bypass tunnel packets processed") + +typedef enum +{ +#define _(sym, str) PVTI_BYPASS_ERROR_##sym, + foreach_pvti_bypass_error +#undef _ + PVTI_BYPASS_N_ERROR, +} pvti_bypass_error_t; + +typedef enum +{ + PVTI_BYPASS_NEXT_DROP, + PVTI_BYPASS_NEXT_PVTI_INPUT, + PVTI_BYPASS_N_NEXT, +} pvti_bypass_next_t; + +#endif // pvti_bypass_h diff --git a/src/plugins/pvti/input-main.c b/src/plugins/pvti/input-main.c new file mode 100644 index 00000000000..8ab8b18dd7c --- /dev/null +++ b/src/plugins/pvti/input-main.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <pvti/input.h> + +static char *pvti_input_error_strings[] = { +#define _(sym, string) string, + foreach_pvti_input_error +#undef _ +}; + +#define _(f, s) s, +static char *pvti_input_trace_type_names[] = { foreach_pvti_input_trace_type }; +#undef _ + +static char * +get_pvti_trace_type_name (u8 ptype) +{ + if (ptype < PVTI_INPUT_TRACE_N_TYPES) + { + return pvti_input_trace_type_names[ptype]; + } + else + { + return "unknown"; + } +} + +/* packet trace format function */ +static u8 * +format_pvti_input_trace (u8 *s, va_list *args) +{ + int i; + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + pvti_input_trace_t *t = va_arg (*args, pvti_input_trace_t *); + + u32 indent = format_get_indent (s); + + s = format (s, + "PVTI-IN: sw_if_index %d, next index %d, trace_type: %s(%d), " + "chunkcnt: %d\n", + t->sw_if_index, t->next_index, + get_pvti_trace_type_name (t->trace_type), t->trace_type, + t->chunk_count); + s = format (s, " src %U sport %d dport %d\n", format_ip_address, + &t->remote_ip, t->remote_port, t->local_port); + s = format (s, " seq: %d, chunk_count: %d\n", t->seq, t->chunk_count); + u16 max = t->chunk_count > MAX_CHUNKS ? MAX_CHUNKS : t->chunk_count; + for (i = 0; i < max; i++) + { + s = format (s, " %02d: sz %d\n", i, t->chunks[i].total_chunk_length); + } + s = format (s, "\n%U%U", format_white_space, indent, + format_ip_adjacency_packet_data, t->packet_data, + sizeof (t->packet_data)); + + return s; +} + +vlib_node_registration_t pvti4_input_node; +vlib_node_registration_t pvti6_input_node; + +VLIB_REGISTER_NODE (pvti4_input_node) = +{ + .name = "pvti4-input", + .vector_size = sizeof (u32), + .format_trace = format_pvti_input_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_input_error_strings), + .error_strings = pvti_input_error_strings, + + .n_next_nodes = PVTI_INPUT_N_NEXT, + + .next_nodes = { + [PVTI_INPUT_NEXT_DROP] = "error-drop", + [PVTI_INPUT_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [PVTI_INPUT_NEXT_IP6_INPUT] = "ip6-input", + [PVTI_INPUT_NEXT_PUNT] = "error-punt", + }, + +}; +VLIB_REGISTER_NODE (pvti6_input_node) = +{ + .name = "pvti6-input", + .vector_size = sizeof (u32), + .format_trace = format_pvti_input_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_input_error_strings), + .error_strings = pvti_input_error_strings, + + .n_next_nodes = PVTI_INPUT_N_NEXT, + + .next_nodes = { + [PVTI_INPUT_NEXT_DROP] = "error-drop", + [PVTI_INPUT_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [PVTI_INPUT_NEXT_IP6_INPUT] = "ip6-input", + [PVTI_INPUT_NEXT_PUNT] = "error-punt", + }, + +}; diff --git a/src/plugins/pvti/input.c b/src/plugins/pvti/input.c new file mode 100644 index 00000000000..6a8806e2795 --- /dev/null +++ b/src/plugins/pvti/input.c @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> +#include <pvti/input.h> + +always_inline void +pvti_enqueue_rx_bi_to_next_and_trace (vlib_main_t *vm, + vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, u32 bi0, + u16 next0) +{ + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + + if (PREDICT_TRUE (vlib_trace_buffer (vm, node, next0, b0, + /* follow_chain */ 0))) + { + pvti_input_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + t->trace_type = PVTI_INPUT_TRACE_decap; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + vec_add1 (ptd->pending_rx_buffers, bi0); + vec_add1 (ptd->pending_rx_nexts, next0); +} + +always_inline pvti_rx_peer_t * +pvti_try_find_or_create_rx_peer (pvti_per_thread_data_t *ptd, + vlib_buffer_t *b0, bool is_ip6) +{ + pvti_rx_peer_t *peer; + + ip_address_t remote_ip = { 0 }; + u16 remote_port; + if (is_ip6) + { + pvti_ip6_encap_header_t *h0 = + ((pvti_ip6_encap_header_t *) vlib_buffer_get_current (b0)) - 1; + ip_address_set (&remote_ip, &h0->ip6.src_address, AF_IP6); + remote_port = clib_net_to_host_u16 (h0->udp.src_port); + } + else + { + pvti_ip4_encap_header_t *h0 = + ((pvti_ip4_encap_header_t *) vlib_buffer_get_current (b0)) - 1; + ip_address_set (&remote_ip, &h0->ip4.src_address, AF_IP4); + remote_port = clib_net_to_host_u16 (h0->udp.src_port); + } + + pool_foreach (peer, ptd->rx_peers) + { + if (peer->remote_port == remote_port && + 0 == ip_address_cmp (&remote_ip, &peer->remote_ip)) + { + if (peer->deleted) + { + // The peer has been marked as deleted - wipe it. 
+ clib_memset (peer, 0xca, sizeof (*peer)); + pool_put (ptd->rx_peers, peer); + continue; + } + return peer; + } + } + + index_t pvti_if_index0 = + pvti_if_find_by_remote_ip_and_port (&remote_ip, remote_port); + if (INDEX_INVALID == pvti_if_index0) + { + // no suitable interface found, bail + return 0; + } + pvti_if_t *pvti_if0 = pvti_if_get (pvti_if_index0); + + pvti_rx_peer_t new_peer = { + .local_ip = pvti_if0->local_ip, + .local_port = pvti_if0->local_port, + .remote_ip = remote_ip, + .remote_port = remote_port, + .pvti_if_index = pvti_if_index0, + .rx_streams = { { 0 } }, + }; + pvti_rx_peer_t *rx_new_peer; + pool_get (ptd->rx_peers, rx_new_peer); + *rx_new_peer = new_peer; + + int i; + for (i = 0; i < MAX_RX_STREAMS; i++) + { + rx_new_peer->rx_streams[i].rx_bi0 = INDEX_INVALID; + rx_new_peer->rx_streams[i].rx_bi0_first = INDEX_INVALID; + rx_new_peer->rx_streams[i].rx_next0 = 0; + } + + return rx_new_peer; +} + +always_inline u16 +pvti_input_node_common (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, bool is_ip6) +{ + u32 n_left_from, *from; + pvti_chunk_header_t *chunks[MAX_CHUNKS]; + u32 pkts_processed = 0; + u32 pkts_decapsulated = 0; + u32 decap_failed_no_buffers = 0; + + pvti_main_t *pvm = &pvti_main; + + u32 thread_index = vlib_get_thread_index (); + pvti_per_thread_data_t *ptd = + vec_elt_at_index (pvm->per_thread_data[is_ip6], thread_index); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + while (n_left_from > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = PVTI_INPUT_NEXT_DROP; + u32 sw_if_index0; + u8 true_chunk_count = 0; + u8 max_chunk_count; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + b0 = vlib_get_buffer (vm, bi0); + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + pvti_ip4_encap_header_t *h0 = + ((pvti_ip4_encap_header_t *) vlib_buffer_get_current (b0)) - 1; + pvti_rx_peer_t *pvti_rx_peer0 = + pvti_try_find_or_create_rx_peer (ptd, b0, is_ip6); + if (!pvti_rx_peer0) + { + b0->error = node->errors[PVTI_INPUT_ERROR_PEER]; + goto drop_and_maybe_trace; + } + + b0 = vlib_get_buffer (vm, bi0); + pvti_packet_header_t *pvti0 = vlib_buffer_get_current (b0); + u8 stream_index = pvti0->stream_index; + max_chunk_count = + pvti0->chunk_count < MAX_CHUNKS ? pvti0->chunk_count : MAX_CHUNKS; + u16 pvti_packet_header_sz0 = + pvti0->pad_bytes + offsetof (pvti_packet_header_t, pad); + if (b0->current_length < pvti_packet_header_sz0) + { + b0->error = node->errors[PVTI_INPUT_ERROR_PACKET_TOO_SHORT]; + goto drop_and_maybe_trace; + } + vlib_buffer_advance (b0, pvti_packet_header_sz0); + + if (max_chunk_count == 0) + { + b0->error = node->errors[PVTI_INPUT_ERROR_NOCHUNKS]; + goto drop_and_maybe_trace; + } + if (pvti0->reass_chunk_count > max_chunk_count) + { + b0->error = node->errors[PVTI_INPUT_ERROR_TOOMANYREASS]; + goto drop_and_maybe_trace; + } + pvti_per_rx_stream_data_t *rx_stream0 = + &pvti_rx_peer0->rx_streams[stream_index]; + + u32 new_seq0 = clib_net_to_host_u32 (pvti0->seq); + if (new_seq0 == rx_stream0->last_rx_seq + 1) + { + /* Sequence# matches, we can attempt adding the leading chunks to + * reassembly */ + rx_stream0->last_rx_seq = new_seq0; + + while ((b0->current_length > 0) && + true_chunk_count < pvti0->reass_chunk_count) + { + /* attempt to either incorporate the first chunk into + * reassembly or skip it. 
*/ + pvti_chunk_header_t *pvc0 = vlib_buffer_get_current (b0); + const u16 chunk_payload_length = + clib_net_to_host_u16 (pvc0->total_chunk_length) - + sizeof (*pvc0); + vlib_buffer_advance (b0, sizeof (*pvc0)); + + if (rx_stream0->rx_bi0 == INDEX_INVALID) + { + clib_warning ( + "RX internal error: not-first chunk but no wip block"); + } + else + { + + vlib_buffer_t *rb0 = + vlib_get_buffer (vm, rx_stream0->rx_bi0); + u16 allowed_length = + PVTI_RX_MAX_LENGTH - rb0->current_length; + if (allowed_length > chunk_payload_length) + { + // simple case - there is space in the buffer to fit + // the whole chunk + void *tail = + vlib_buffer_put_uninit (rb0, chunk_payload_length); + clib_memcpy (tail, vlib_buffer_get_current (b0), + chunk_payload_length); + } + else + { + // The current chunk can not fit - need to make two + // copies, one into the current buffer, and one into + // a newly allocated chained buffer. + void *tail = + vlib_buffer_put_uninit (rb0, allowed_length); + clib_memcpy (tail, vlib_buffer_get_current (b0), + allowed_length); + u16 remaining_payload_length = + chunk_payload_length - allowed_length; + u32 nrbi0 = pvti_get_new_buffer (vm); + if (INDEX_INVALID == nrbi0) + { + ASSERT (0); // FIXME what the recovery is + // supposed to look like ? + } + else + { + // link up the new buffer and copy the remainder + // there + vlib_buffer_t *nrb0 = vlib_get_buffer (vm, nrbi0); + rb0->flags |= VLIB_BUFFER_NEXT_PRESENT; + rb0->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + rb0->next_buffer = nrbi0; + rx_stream0->rx_bi0 = nrbi0; + void *tail = vlib_buffer_put_uninit ( + nrb0, remaining_payload_length); + clib_memcpy (tail, + vlib_buffer_get_current (b0) + + allowed_length, + remaining_payload_length); + } + } + pvti_rx_peer0->rx_streams[stream_index] + .rx_received_inner_length += chunk_payload_length; + if (pvti_rx_peer0->rx_streams[stream_index] + .rx_received_inner_length == + pvti_rx_peer0->rx_streams[stream_index] + .rx_expected_inner_length) + { + next0 = rx_stream0->rx_next0; + pvti_enqueue_rx_bi_to_next_and_trace ( + vm, node, ptd, rx_stream0->rx_bi0_first, next0); + pkts_decapsulated += 1; + + // clean out the current reassemly state + rx_stream0->rx_bi0 = INDEX_INVALID; + rx_stream0->rx_bi0_first = INDEX_INVALID; + pvti_rx_peer0->rx_streams[stream_index] + .rx_received_inner_length = 0; + pvti_rx_peer0->rx_streams[stream_index] + .rx_expected_inner_length = 0; + rx_stream0->rx_next0 = 0; + } + } + chunks[true_chunk_count] = pvc0; + true_chunk_count += 1; + vlib_buffer_advance (b0, chunk_payload_length); + } + } + else + { + /* Sequence does not match, skip the reassembly chunks and reset + * the reassembly state */ + + while ((b0->current_length > 0) && + true_chunk_count < pvti0->reass_chunk_count) + { + /* skip the reassembly chunks */ + pvti_chunk_header_t *pvc0 = vlib_buffer_get_current (b0); + chunks[true_chunk_count] = pvc0; + true_chunk_count += 1; + vlib_buffer_advance ( + b0, clib_net_to_host_u16 (pvc0->total_chunk_length)); + } + // FIXME: discard the current reassembly state, reset the seq# + if (rx_stream0->rx_bi0_first != INDEX_INVALID) + { + clib_warning ("RX PVTI: discard chunk being reassembled"); + vlib_buffer_free_one (vm, rx_stream0->rx_bi0_first); + rx_stream0->rx_bi0 = INDEX_INVALID; + rx_stream0->rx_bi0_first = INDEX_INVALID; + rx_stream0->rx_received_inner_length = 0; + rx_stream0->rx_expected_inner_length = 0; + rx_stream0->rx_next0 = 0; + } + } + + while ((b0->current_length > 0) && true_chunk_count < max_chunk_count) + { + if (b0->current_length < sizeof 
(pvti_chunk_header_t)) + { + clib_warning ("RX ERR: length too short for a chunk"); + break; + } + pvti_chunk_header_t *pvc0 = vlib_buffer_get_current (b0); + chunks[true_chunk_count] = pvc0; + true_chunk_count += 1; + u16 total_chunk_length = + clib_net_to_host_u16 (pvc0->total_chunk_length); + if (b0->current_length < total_chunk_length) + { + clib_warning ("RX ERR: length 0x%x too big for a chunk", + true_chunk_count); + break; + } + u8 *pkt = (u8 *) (pvc0 + 1); + u16 inner_length; + if (rx_stream0->rx_bi0_first != INDEX_INVALID) + { + vlib_buffer_free_one (vm, rx_stream0->rx_bi0_first); + rx_stream0->rx_bi0 = INDEX_INVALID; + rx_stream0->rx_bi0_first = INDEX_INVALID; + rx_stream0->rx_received_inner_length = 0; + rx_stream0->rx_expected_inner_length = 0; + rx_stream0->rx_next0 = 0; + } + + switch (*pkt & 0xf0) + { + case 0x40: + next0 = PVTI_INPUT_NEXT_IP4_INPUT; + inner_length = clib_net_to_host_u16 (*((u16 *) (pkt + 2))); + break; + case 0x60: + next0 = PVTI_INPUT_NEXT_IP6_INPUT; + inner_length = clib_net_to_host_u16 (*((u16 *) (pkt + 4))) + + sizeof (ip6_header_t); + break; + default: + next0 = PVTI_INPUT_NEXT_DROP; + vlib_buffer_advance (b0, total_chunk_length); + continue; + } + vlib_buffer_advance (b0, sizeof (pvti_chunk_header_t)); + + if (inner_length + sizeof (pvti_chunk_header_t) > total_chunk_length) + { + /* FIXME: the packet size is larger than the chunk -> it's a + * first fragment */ + // enqueue the chunk and finish packet processing. + // There must be no active reassembly. + ASSERT (rx_stream0->rx_bi0_first == INDEX_INVALID); + rx_stream0->rx_next0 = next0; + rx_stream0->rx_bi0 = bi0; + rx_stream0->rx_bi0_first = bi0; + rx_stream0->rx_expected_inner_length = inner_length; + rx_stream0->rx_received_inner_length = + total_chunk_length - sizeof (pvti_chunk_header_t); + rx_stream0->last_rx_seq = new_seq0; + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_input_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = ~0; + t->trace_type = PVTI_INPUT_TRACE_enqueue; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + goto continue_outer; + } + + u32 nbi0 = pvti_get_new_buffer (vm); + if (INDEX_INVALID == nbi0) + { + decap_failed_no_buffers += 1; + continue; + }; + vlib_buffer_t *nb0 = vlib_get_buffer (vm, nbi0); + pvti_if_t *pvti_if0 = pvti_if_get (pvti_rx_peer0->pvti_if_index); + vnet_buffer (nb0)->sw_if_index[VLIB_RX] = pvti_if0->sw_if_index; + void *new_packet = vlib_buffer_put_uninit (nb0, inner_length); + clib_memcpy (new_packet, pvc0 + 1, inner_length); + vlib_buffer_advance (b0, inner_length); + + pvti_enqueue_rx_bi_to_next_and_trace (vm, node, ptd, nbi0, next0); + pkts_decapsulated += 1; + } + /* we have processed all the chunks from the buffer, but the buffer + * remains. Free it. */ + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_input_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = ~0; + t->trace_type = PVTI_INPUT_TRACE_free; + t->seq = clib_net_to_host_u32 (pvti0->seq); + t->chunk_count = pvti0->chunk_count; + u8 chunk_count = + pvti0->chunk_count < MAX_CHUNKS ? 
pvti0->chunk_count : MAX_CHUNKS; + for (int i = 0; i < chunk_count; i++) + { + t->chunks[i].total_chunk_length = + clib_net_to_host_u16 (chunks[i]->total_chunk_length); + } + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + vlib_buffer_free_one (vm, bi0); + + continue_outer: + pkts_processed += 1; + continue; + + drop_and_maybe_trace: + next0 = PVTI_INPUT_NEXT_DROP; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + int i; + pvti_input_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->trace_type = PVTI_INPUT_TRACE_drop; + t->next_index = next0; + t->remote_ip.ip.ip4 = h0->ip4.src_address; + t->remote_ip.version = AF_IP4; + t->local_port = h0->udp.dst_port; + t->remote_port = h0->udp.src_port; + if (!pvti_rx_peer0) + { + t->seq = 0xdeaddead; + } + else + { + t->seq = clib_net_to_host_u32 (pvti0->seq); + t->chunk_count = pvti0->chunk_count; + u8 chunk_count = pvti0->chunk_count < MAX_CHUNKS ? + pvti0->chunk_count : + MAX_CHUNKS; + for (i = 0; i < chunk_count; i++) + { + t->chunks[i].total_chunk_length = + clib_net_to_host_u16 (chunks[i]->total_chunk_length); + } + } + } + + pkts_processed += 1; + vec_add1 (ptd->pending_rx_buffers, bi0); + vec_add1 (ptd->pending_rx_nexts, next0); + } + + vlib_buffer_enqueue_to_next_vec (vm, node, &ptd->pending_rx_buffers, + &ptd->pending_rx_nexts, + vec_len (ptd->pending_rx_nexts)); + vec_reset_length (ptd->pending_rx_buffers); + vec_reset_length (ptd->pending_rx_nexts); + + vlib_node_increment_counter (vm, node->node_index, + PVTI_INPUT_ERROR_PROCESSED, pkts_processed); + vlib_node_increment_counter ( + vm, node->node_index, PVTI_INPUT_ERROR_DECAPSULATED, pkts_decapsulated); + vlib_node_increment_counter (vm, node->node_index, + PVTI_INPUT_ERROR_NO_BUFFERS, + decap_failed_no_buffers); + return frame->n_vectors; +} + +VLIB_NODE_FN (pvti4_input_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_input_node_common (vm, node, frame, 0); +} + +VLIB_NODE_FN (pvti6_input_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_input_node_common (vm, node, frame, 1); +} diff --git a/src/plugins/pvti/input.h b/src/plugins/pvti/input.h new file mode 100644 index 00000000000..02a186cde05 --- /dev/null +++ b/src/plugins/pvti/input.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_pvti_input_h__ +#define __included_pvti_input_h__ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +typedef struct +{ + u16 total_chunk_length; +} pvti_input_chunk_t; + +#define MAX_CHUNKS 32 +#define PVTI_RX_MAX_LENGTH 2048 + +typedef struct +{ + u32 next_index; + u32 sw_if_index; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u32 seq; + pvti_input_chunk_t chunks[MAX_CHUNKS]; + u8 chunk_count; + u8 trace_type; + u8 packet_data[64]; +} pvti_input_trace_t; + +#define foreach_pvti_input_trace_type \ + _ (drop, "drop") \ + _ (decap, "decapsulate") \ + _ (free, "free") \ + _ (enqueue, "enqueue") + +typedef enum +{ +#define _(f, s) PVTI_INPUT_TRACE_##f, + foreach_pvti_input_trace_type +#undef _ + PVTI_INPUT_TRACE_N_TYPES, +} pvti_input_trace_type_t; + +#define foreach_pvti_input_error \ + _ (PROCESSED, "PVTI tunneled packets processed") \ + _ (DECAPSULATED, "PVTI inner packets decapsulated") \ + _ (PEER, "Could not find a peer") \ + _ (NOCHUNKS, "Packet has no chunks") \ + _ (NO_BUFFERS, "No buffers available to decapsulate") \ + _ (TOOMANYREASS, "Packet has more reassembly chunks than total") \ + _ (PACKET_TOO_SHORT, "Packet too short") + +typedef enum +{ +#define _(sym, str) PVTI_INPUT_ERROR_##sym, + foreach_pvti_input_error +#undef _ + PVTI_INPUT_N_ERROR, +} pvti_input_error_t; + +typedef enum +{ + PVTI_INPUT_NEXT_DROP, + PVTI_INPUT_NEXT_IP4_INPUT, + PVTI_INPUT_NEXT_IP6_INPUT, + PVTI_INPUT_NEXT_PUNT, + PVTI_INPUT_N_NEXT, +} pvti_input_next_t; + +#endif // pvti_input_h diff --git a/src/plugins/pvti/output-main.c b/src/plugins/pvti/output-main.c new file mode 100644 index 00000000000..ae4ae5f8e98 --- /dev/null +++ b/src/plugins/pvti/output-main.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <pvti/output.h> + +/* packet trace format function */ +static u8 * +format_pvti_output_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + pvti_output_trace_t *t = va_arg (*args, pvti_output_trace_t *); + + u32 indent = format_get_indent (s); + s = + format (s, "PVTI-OUT(%d): sw_if_index %d, next index %d, underlay_mtu %d,", + t->trace_type, t->sw_if_index, t->next_index, t->underlay_mtu); + s = format (s, "\n%U stream_index %d, bi0_max_current_length %d, tx_seq %d", + format_white_space, indent, t->stream_index, + t->bi0_max_current_length, t->tx_seq); + s = format (s, "\n%U%U", format_white_space, indent, + format_ip_adjacency_packet_data, t->packet_data, + sizeof (t->packet_data)); + + return s; +} + +vlib_node_registration_t pvti_output_node; + +static char *pvti_output_error_strings[] = { +#define _(sym, string) string, + foreach_pvti_output_error +#undef _ +}; + +VLIB_REGISTER_NODE (pvti4_output_node) = +{ + .name = "pvti4-output", + .vector_size = sizeof (u32), + .format_trace = format_pvti_output_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(pvti_output_error_strings), + .error_strings = pvti_output_error_strings, + + .n_next_nodes = PVTI_OUTPUT_N_NEXT, + + .next_nodes = { + [PVTI_OUTPUT_NEXT_DROP] = "error-drop", + [PVTI_OUTPUT_NEXT_INTERFACE_OUTPUT] = "adj-midchain-tx", + [PVTI_OUTPUT_NEXT_IP4_LOOKUP] = "ip4-lookup", + [PVTI_OUTPUT_NEXT_IP6_LOOKUP] = "ip6-lookup", + }, + +}; +VLIB_REGISTER_NODE (pvti6_output_node) = +{ + .name = "pvti6-output", + .vector_size = sizeof (u32), + .format_trace = format_pvti_output_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(pvti_output_error_strings), + .error_strings = pvti_output_error_strings, + + .n_next_nodes = PVTI_OUTPUT_N_NEXT, + + .next_nodes = { + [PVTI_OUTPUT_NEXT_DROP] = "error-drop", + [PVTI_OUTPUT_NEXT_INTERFACE_OUTPUT] = "adj-midchain-tx", + [PVTI_OUTPUT_NEXT_IP4_LOOKUP] = "ip4-lookup", + [PVTI_OUTPUT_NEXT_IP6_LOOKUP] = "ip6-lookup", + }, + +}; diff --git a/src/plugins/pvti/output.c b/src/plugins/pvti/output.c new file mode 100644 index 00000000000..1939c6f585a --- /dev/null +++ b/src/plugins/pvti/output.c @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> +#include <pvti/output.h> + +static_always_inline u32 +ip6_vtcfl (u8 stream_index) +{ + u32 vtcfl = 0x6 << 28; + vtcfl |= stream_index; + + return (clib_host_to_net_u32 (vtcfl)); +} + +always_inline vlib_buffer_t * +pvti_alloc_new_tx_buffer (vlib_main_t *vm) +{ + u32 bi0 = INDEX_INVALID; + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + return 0; + } + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + b0->current_data = 0; + b0->current_length = 0; + return b0; +} + +always_inline bool +pvti_find_or_try_create_tx_peer (vlib_main_t *vm, pvti_per_thread_data_t *ptd, + pvti_if_t *pvti_if0, ip_address_t *remote_ip, + u16 remote_port, u32 *out_index) +{ + + pvti_tx_peer_t *peer; + pool_foreach (peer, ptd->tx_peers) + { + if (peer->remote_port == remote_port && + 0 == ip_address_cmp (remote_ip, &peer->remote_ip)) + { + if (peer->deleted) + { + // Bad luck, the peer has been deleted. + u32 boi0 = vlib_get_buffer_index (vm, peer->bo0); + if (peer->bo0) + { + vlib_buffer_free (vm, &boi0, 1); + } + clib_memset (peer, 0xca, sizeof (*peer)); + pool_put (ptd->tx_peers, peer); + continue; + } + *out_index = peer - ptd->tx_peers; + return 1; + } + } + + ip_address_family_t dst_ver = ip_addr_version (&pvti_if0->remote_ip); + + u16 pvti_encap_overhead = (dst_ver == AF_IP6) ? + sizeof (pvti_ip6_encap_header_t) : + sizeof (pvti_ip4_encap_header_t); + + u16 pvti_packet_overhead = + pvti_encap_overhead + sizeof (pvti_packet_header_t) + PVTI_ALIGN_BYTES; + + ASSERT (pvti_if0->underlay_mtu > pvti_packet_overhead); + + u32 bo0_max_current_length = pvti_if0->underlay_mtu - pvti_packet_overhead; + + vlib_buffer_t *bo0 = pvti_alloc_new_tx_buffer (vm); + + if (!bo0) + { + return 0; + } + + pvti_tx_peer_t new_peer = { + .local_ip = pvti_if0->local_ip, + .remote_ip = *remote_ip, + .local_port = pvti_if0->local_port, + .remote_port = remote_port, + .underlay_mtu = pvti_if0->underlay_mtu, + .underlay_fib_index = pvti_if0->underlay_fib_index, + .bo0_max_current_length = bo0_max_current_length, + .pvti_if_index = pvti_if_get_index (pvti_if0), + .deleted = 0, + .bo0 = bo0, + .chunk_count = 0, + .reass_chunk_count = 0, + .current_tx_seq = 42, + }; + + pvti_tx_peer_t *tx_new_peer; + pool_get (ptd->tx_peers, tx_new_peer); + + *tx_new_peer = new_peer; + *out_index = tx_new_peer - ptd->tx_peers; + return 1; +} + +always_inline bool +pvti_try_get_tx_peer_index (vlib_main_t *vm, pvti_per_thread_data_t *ptd, + pvti_if_t *pvti_if0, vlib_buffer_t *b0, + bool is_ip6, u32 *out_index) +{ + if (pvti_if0->peer_address_from_payload) + { + ip_address_t remote_ip = { 0 }; + if (is_ip6) + { + ip6_header_t *ip6 = vlib_buffer_get_current (b0); + ip_address_set (&remote_ip, &ip6->dst_address, AF_IP6); + } + else + { + ip4_header_t *ip4 = vlib_buffer_get_current (b0); + ip_address_set (&remote_ip, &ip4->dst_address, AF_IP4); + } + return pvti_find_or_try_create_tx_peer ( + vm, ptd, pvti_if0, &remote_ip, pvti_if0->remote_port, out_index); + } + else + { + return pvti_find_or_try_create_tx_peer ( + vm, ptd, pvti_if0, &pvti_if0->remote_ip, pvti_if0->remote_port, + out_index); + } + /* not reached */ +} + +always_inline void +pvti_finalize_chunk (pvti_tx_peer_t *tx_peer, + pvti_chunk_header_t *chunk_header, u8 *tail, + bool is_reassembly_chunk) +{ + clib_memset (chunk_header, 0xab, sizeof (pvti_chunk_header_t)); + chunk_header->total_chunk_length = + clib_host_to_net_u16 (tail - (u8 *) 
chunk_header); + tx_peer->chunk_count++; + if (is_reassembly_chunk) + { + tx_peer->reass_chunk_count++; + } +} + +always_inline pvti_output_next_t +encap_pvti_buffer_ip46 (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_tx_peer_t *tx_peer, int is_ip6) +{ + ip_address_family_t src_ver = ip_addr_version (&tx_peer->local_ip); + ip_address_family_t dst_ver = ip_addr_version (&tx_peer->remote_ip); + u8 stream_index = 0; + + ASSERT (src_ver == dst_ver); + bool is_ip6_encap = (AF_IP6 == src_ver); + + vlib_buffer_t *b0 = tx_peer->bo0; + vlib_buffer_advance (b0, + -(sizeof (pvti_packet_header_t) + PVTI_ALIGN_BYTES)); + + pvti_packet_header_t *pvti0 = vlib_buffer_get_current (b0); + clib_memset (pvti0, 0xca, sizeof (*pvti0) + PVTI_ALIGN_BYTES); + pvti0->pad_bytes = PVTI_ALIGN_BYTES; + + pvti0->seq = clib_host_to_net_u32 (tx_peer->current_tx_seq); + pvti0->stream_index = stream_index; + pvti0->reass_chunk_count = tx_peer->reass_chunk_count; + pvti0->chunk_count = tx_peer->chunk_count; + pvti0->mandatory_flags_mask = 0; + pvti0->flags_value = 0; + + if (is_ip6_encap) + { + vlib_buffer_advance (b0, -(sizeof (pvti_ip6_encap_header_t))); + if (b0->current_data < -VLIB_BUFFER_PRE_DATA_SIZE) + { + // undo the change + vlib_buffer_advance (b0, (sizeof (pvti_ip6_encap_header_t))); + b0->error = node->errors[PVTI_OUTPUT_ERROR_NO_PRE_SPACE]; + return PVTI_OUTPUT_NEXT_DROP; + } + pvti_ip6_encap_header_t *ve = vlib_buffer_get_current (b0); + + ve->udp.src_port = clib_host_to_net_u16 (tx_peer->local_port); + ve->udp.dst_port = clib_host_to_net_u16 (tx_peer->remote_port); + ve->udp.length = clib_host_to_net_u16 ( + b0->current_length - offsetof (pvti_ip6_encap_header_t, udp)); + ve->udp.checksum = 0; + + ve->ip6.ip_version_traffic_class_and_flow_label = + ip6_vtcfl (stream_index); + ve->ip6.payload_length = ve->udp.length; + ve->ip6.protocol = 17; + ve->ip6.hop_limit = 128; + ip_address_copy_addr (&ve->ip6.src_address, &tx_peer->local_ip); + ip_address_copy_addr (&ve->ip6.dst_address, &tx_peer->remote_ip); + } + else + { + vlib_buffer_advance (b0, -(sizeof (pvti_ip4_encap_header_t))); + if (b0->current_data < -VLIB_BUFFER_PRE_DATA_SIZE) + { + // undo the change + vlib_buffer_advance (b0, (sizeof (pvti_ip4_encap_header_t))); + b0->error = node->errors[PVTI_OUTPUT_ERROR_NO_PRE_SPACE]; + return PVTI_OUTPUT_NEXT_DROP; + } + pvti_ip4_encap_header_t *ve = vlib_buffer_get_current (b0); + + ve->udp.src_port = clib_host_to_net_u16 (tx_peer->local_port); + ve->udp.dst_port = clib_host_to_net_u16 (tx_peer->remote_port); + ve->udp.length = clib_host_to_net_u16 ( + b0->current_length - offsetof (pvti_ip4_encap_header_t, udp)); + ve->udp.checksum = 0; + + ve->ip4.ip_version_and_header_length = 0x45; + ve->ip4.tos = 0; + ve->ip4.length = clib_host_to_net_u16 (b0->current_length); + ve->ip4.fragment_id = + clib_host_to_net_u16 (tx_peer->current_tx_seq & 0xffff); + ve->ip4.flags_and_fragment_offset = 0; + ve->ip4.ttl = 128; + ve->ip4.protocol = 17; + + ve->ip4.dst_address.as_u32 = ip_addr_v4 (&tx_peer->remote_ip).data_u32; + ve->ip4.src_address.as_u32 = ip_addr_v4 (&tx_peer->local_ip).data_u32; + ve->ip4.checksum = ip4_header_checksum (&ve->ip4); + } + + // This is important, if not reset, causes a crash + vnet_buffer (b0)->sw_if_index[VLIB_TX] = tx_peer->underlay_fib_index; + + // vnet_buffer (b0)->oflags |= VNET_BUFFER_OFFLOAD_F_IP_CKSUM; + return is_ip6_encap ? 
PVTI_OUTPUT_NEXT_IP6_LOOKUP : + PVTI_OUTPUT_NEXT_IP4_LOOKUP; +} + +always_inline void +pvti_enqueue_tx_and_trace (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, vlib_buffer_t *b0, + u16 next0, u8 stream_index, pvti_tx_peer_t *tx_peer) +{ + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + tx_peer->is_bo0_traced)) + { + if (PREDICT_TRUE ( + vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0))) + { + + pvti_output_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + t->underlay_mtu = tx_peer->underlay_mtu; + t->stream_index = stream_index; + t->trace_type = 1; + t->bi0_max_current_length = tx_peer->bo0_max_current_length; + t->tx_seq = tx_peer->current_tx_seq; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + } + u32 bi0 = vlib_get_buffer_index (vm, b0); + vec_add1 (ptd->pending_tx_buffers, bi0); + vec_add1 (ptd->pending_tx_nexts, next0); +} + +always_inline void +pvti_enqueue_tx_drop_and_trace (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, vlib_buffer_t *b0, + u8 stream_index) +{ + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_output_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = PVTI_OUTPUT_NEXT_DROP; + t->stream_index = stream_index; + t->trace_type = 0; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + u32 bi0 = vlib_get_buffer_index (vm, b0); + vec_add1 (ptd->pending_tx_buffers, bi0); + vec_add1 (ptd->pending_tx_nexts, PVTI_OUTPUT_NEXT_DROP); +} + +always_inline bool +pvti_flush_peer_and_recharge (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, u32 tx_peer_index, + u8 stream_index, const bool is_ip6) +{ + pvti_tx_peer_t *tx_peer = pool_elt_at_index (ptd->tx_peers, tx_peer_index); + u16 next0 = encap_pvti_buffer_ip46 (vm, node, tx_peer, is_ip6); + + pvti_enqueue_tx_and_trace (vm, node, ptd, tx_peer->bo0, next0, stream_index, + tx_peer); + + tx_peer->bo0 = pvti_alloc_new_tx_buffer (vm); + tx_peer->reass_chunk_count = 0; + tx_peer->chunk_count = 0; + tx_peer->current_tx_seq++; + + return 1; +} + +always_inline u16 +pvti_output_node_common (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, const bool is_ip6) +{ + pvti_main_t *pvm = &pvti_main; + + u32 n_left_from, *from; + u32 pkts_encapsulated = 0; + u32 pkts_processed = 0; + u32 pkts_chopped = 0; + u32 pkts_overflow = 0; + u32 pkts_overflow_cantfit = 0; + + bool is_node_traced = (node->flags & VLIB_NODE_FLAG_TRACE) ? 1 : 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + u8 stream_index = pvti_get_stream_index (is_ip6); + + u32 thread_index = vlib_get_thread_index (); + pvti_per_thread_data_t *ptd = + vec_elt_at_index (pvm->per_thread_data[is_ip6], thread_index); + + vlib_buffer_t *ibufs[VLIB_FRAME_SIZE], **ib = ibufs; + + vlib_get_buffers (vm, from, ibufs, n_left_from); + + n_left_from = frame->n_vectors; + while (1 && n_left_from > 0) + { + n_left_from -= 1; + vlib_buffer_t *b0 = ib[0]; + ib++; + u32 bi0 = vlib_get_buffer_index (vm, b0); + bool is_b0_traced = + is_node_traced && ((b0->flags & VLIB_BUFFER_IS_TRACED) ? 
1 : 0); + pkts_processed += 1; + + u32 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + u32 pvti_index0 = pvti_if_find_by_sw_if_index (sw_if_index0); + if (pvti_index0 == INDEX_INVALID) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_PEER]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, stream_index); + continue; + } + pvti_if_t *pvti_if0 = pvti_if_get (pvti_index0); + u32 tx_peer_index; + if (!pvti_try_get_tx_peer_index (vm, ptd, pvti_if0, b0, is_ip6, + &tx_peer_index)) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_MAKE_PEER]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, stream_index); + continue; + } + pvti_tx_peer_t *tx_peer = &ptd->tx_peers[tx_peer_index]; + + u32 b0_len = vlib_buffer_length_in_chain (vm, b0); + u32 total_chunk_len = sizeof (pvti_chunk_header_t) + b0_len; + + if (tx_peer->bo0_max_current_length >= + tx_peer->bo0->current_length + total_chunk_len) + { + /* Happy case, we can fit the entire new chunk */ + pvti_chunk_header_t *chunk_header = vlib_buffer_put_uninit ( + tx_peer->bo0, sizeof (pvti_chunk_header_t)); + u8 *tail = vlib_buffer_put_uninit (tx_peer->bo0, b0_len); + vlib_buffer_t *b0_curr; + b0_curr = b0; + while (b0_len > 0) + { + clib_memcpy (tail, vlib_buffer_get_current (b0_curr), + b0_curr->current_length); + tail += b0_curr->current_length; + b0_len -= b0_curr->current_length; + ASSERT ((b0_len == 0) || + (b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT)); + if (b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b0_curr = vlib_get_buffer (vm, b0_curr->next_buffer); + } + } + tx_peer->is_bo0_traced |= is_b0_traced; + pvti_finalize_chunk (tx_peer, chunk_header, tail, false); + } + else + { + bool is_reassembly = false; + /* FIXME: here, flush a packet if we want to avoid fragmenting it */ +#define PVTI_TINY_PACKET_SZ 20 + int threshold_len = + sizeof (pvti_chunk_header_t) + PVTI_TINY_PACKET_SZ; + + /* Can we fit anything meaningful into bo0 ? if not - flush */ + if (tx_peer->bo0_max_current_length <= + tx_peer->bo0->current_length + threshold_len) + { + if (!pvti_flush_peer_and_recharge (vm, node, ptd, tx_peer_index, + stream_index, is_ip6)) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_RECHARGE0]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, + stream_index); + continue; + } + pkts_encapsulated += 1; + } + + pvti_chunk_header_t *chunk_header = vlib_buffer_put_uninit ( + tx_peer->bo0, sizeof (pvti_chunk_header_t)); + + u8 *tail; + vlib_buffer_t *b0_curr; + /* append the chained buffers and flush as necessary */ + b0_curr = b0; + + int curr_b0_start_offset = 0; + + while (b0_len > 0) + { + ASSERT (tx_peer->bo0_max_current_length > + tx_peer->bo0->current_length); + int copy_len = + clib_min (b0_curr->current_length - curr_b0_start_offset, + tx_peer->bo0_max_current_length - + tx_peer->bo0->current_length); + tail = vlib_buffer_put_uninit (tx_peer->bo0, copy_len); + clib_memcpy (tail, + (u8 *) vlib_buffer_get_current (b0_curr) + + curr_b0_start_offset, + copy_len); + tail += copy_len; + b0_len -= copy_len; + // Advance the start offset or reset it if we copied the entire + // block + curr_b0_start_offset = + curr_b0_start_offset + copy_len == b0_curr->current_length ? 
+ 0 : + curr_b0_start_offset + copy_len; + ASSERT ((b0_len == 0) || (curr_b0_start_offset > 0) || + (b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT)); + if (curr_b0_start_offset > 0) + { + pvti_finalize_chunk (tx_peer, chunk_header, tail, + is_reassembly); + tx_peer->is_bo0_traced |= is_b0_traced; + if (!pvti_flush_peer_and_recharge ( + vm, node, ptd, tx_peer_index, stream_index, is_ip6)) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_RECHARGE1]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, + stream_index); + continue; + } + pkts_encapsulated += 1; + /* next chunk(s) will be reassembly until the next block */ + is_reassembly = true; + chunk_header = vlib_buffer_put_uninit ( + tx_peer->bo0, sizeof (pvti_chunk_header_t)); + } + else + { + if ((b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + b0_curr = vlib_get_buffer (vm, b0_curr->next_buffer); + } + else + { + pvti_finalize_chunk (tx_peer, chunk_header, tail, + is_reassembly); + tx_peer->is_bo0_traced |= is_b0_traced; + } + } + } + } + vlib_buffer_free_one (vm, bi0); + } + + int i; + for (i = 0; i < vec_len (ptd->tx_peers); i++) + { + if (ptd->tx_peers[i].chunk_count) + { + pvti_flush_peer_and_recharge (vm, node, ptd, i, stream_index, + is_ip6); + pkts_encapsulated += 1; + } + } + + vlib_buffer_enqueue_to_next_vec (vm, node, &ptd->pending_tx_buffers, + &ptd->pending_tx_nexts, + vec_len (ptd->pending_tx_nexts)); + vec_reset_length (ptd->pending_tx_buffers); + vec_reset_length (ptd->pending_tx_nexts); + + vlib_node_increment_counter ( + vm, node->node_index, PVTI_OUTPUT_ERROR_ENCAPSULATED, pkts_encapsulated); + vlib_node_increment_counter (vm, node->node_index, + PVTI_OUTPUT_ERROR_PROCESSED, pkts_processed); + vlib_node_increment_counter (vm, node->node_index, PVTI_OUTPUT_ERROR_CHOPPED, + pkts_chopped); + vlib_node_increment_counter (vm, node->node_index, + PVTI_OUTPUT_ERROR_OVERFLOW, pkts_overflow); + vlib_node_increment_counter (vm, node->node_index, + PVTI_OUTPUT_ERROR_OVERFLOW_CANTFIT, + pkts_overflow_cantfit); + return frame->n_vectors; +} + +VLIB_NODE_FN (pvti4_output_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_output_node_common (vm, node, frame, 0); +} + +VLIB_NODE_FN (pvti6_output_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_output_node_common (vm, node, frame, 1); +} diff --git a/src/plugins/pvti/output.h b/src/plugins/pvti/output.h new file mode 100644 index 00000000000..95e78ba9720 --- /dev/null +++ b/src/plugins/pvti/output.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_pvti_output_h__ +#define __included_pvti_output_h__ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +typedef struct +{ + u32 next_index; + u32 sw_if_index; + u32 tx_seq; + u16 underlay_mtu; + u16 bi0_max_current_length; + u8 stream_index; + u8 trace_type; + u8 packet_data[96]; +} pvti_output_trace_t; + +#define foreach_pvti_output_error \ + _ (NONE, "No error") \ + _ (PROCESSED, "Packets processed") \ + _ (ENCAPSULATED, "Packets encapsulated") \ + _ (PEER, "No peer found") \ + _ (MAKE_PEER, "Could not make peer") \ + _ (RECHARGE0, "Could not recharge 0") \ + _ (RECHARGE1, "Could not recharge 1") \ + _ (NO_PRE_SPACE, "Not enought pre-data space") \ + _ (CHOPPED, "Packets chopped") \ + _ (OVERFLOW, "Packets overflowed") \ + _ (OVERFLOW_CANTFIT, "Packets overflowed and cant fit excess") + +typedef enum +{ +#define _(sym, str) PVTI_OUTPUT_ERROR_##sym, + foreach_pvti_output_error +#undef _ + PVTI_OUTPUT_N_ERROR, +} pvti_output_error_t; + +typedef enum +{ + PVTI_INDEPENDENT_CHUNK = 0, + PVTI_REASS_CHUNK, +} pvti_chunk_type_t; + +#define MAX_CURR_LEN_UNKNOWN 0xffff + +typedef enum +{ + PVTI_OUTPUT_NEXT_DROP, + PVTI_OUTPUT_NEXT_INTERFACE_OUTPUT, + PVTI_OUTPUT_NEXT_IP4_LOOKUP, + PVTI_OUTPUT_NEXT_IP6_LOOKUP, + PVTI_OUTPUT_N_NEXT, +} pvti_output_next_t; + +#endif // pvti_output_h diff --git a/src/plugins/pvti/pvti.api b/src/plugins/pvti/pvti.api new file mode 100644 index 00000000000..859ed1ab6b0 --- /dev/null +++ b/src/plugins/pvti/pvti.api @@ -0,0 +1,111 @@ +/* + * pvti.api - binary API skeleton + * + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file pvti.api + * @brief VPP control-plane API messages. + * + * This file defines VPP control-plane binary API messages which are generally + * called through a shared memory interface. + */ + +/* Version and type recitations */ + +option version = "0.0.1"; +import "vnet/interface_types.api"; +import "vnet/ip/ip_types.api"; + +/** \brief A composite type uniquely defining a PVTI tunnel. + @param sw_if_index - ignored on create/delete, present in details. 
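+  @param peer_address_from_payload - if set, remote_ip is not treated as a
+         fixed peer address; the peer address is instead derived from the
+         received packets (mirrors the "peer-address-from-payload" CLI flag)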
+ @param src_ip - Source IP address + @param src_port - Source UDP port + @param dst_ip - Destination IP address + @param dst_port - Destination UDP port + @param underlay_mtu - Underlay MTU for packet splitting/coalescing + @param underlay_fib_index - Underlay FIB index to be used after encap +*/ +typedef pvti_tunnel +{ + vl_api_interface_index_t sw_if_index; + vl_api_address_t local_ip; + u16 local_port; + vl_api_address_t remote_ip; + bool peer_address_from_payload; + u16 remote_port; + u16 underlay_mtu; + u32 underlay_fib_index; +}; + + +/** @brief API to enable / disable pvti on an interface + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param enable_disable - 1 to enable, 0 to disable the feature + @param sw_if_index - interface handle +*/ + +define pvti_interface_create +{ + option status="in_progress"; + + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + vl_api_pvti_tunnel_t interface; +}; + +define pvti_interface_create_reply +{ + option status="in_progress"; + u32 context; + i32 retval; + + /* Index for the newly created interface */ + vl_api_interface_index_t sw_if_index; +}; + +autoreply define pvti_interface_delete { + option status="in_progress"; + + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + + vl_api_interface_index_t sw_if_index; +}; + + +define pvti_interface_dump +{ + option status="in_progress"; + u32 client_index; + u32 context; + vl_api_interface_index_t sw_if_index; +}; + +define pvti_interface_details +{ + option status="in_progress"; + u32 context; + vl_api_pvti_tunnel_t interface; +}; + + diff --git a/src/plugins/pvti/pvti.c b/src/plugins/pvti/pvti.c new file mode 100644 index 00000000000..524eabc6f3f --- /dev/null +++ b/src/plugins/pvti/pvti.c @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <vnet/fib/fib_table.h> +#include <pvti/pvti.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vpp/app/version.h> +#include <stdbool.h> + +#include <pvti/pvti.api_enum.h> +#include <pvti/pvti.api_types.h> + +#include <pvti/pvti_if.h> + +#define REPLY_MSG_ID_BASE pmp->msg_id_base +#include <vlibapi/api_helper_macros.h> +#include <vnet/ip/ip_format_fns.h> + +pvti_main_t pvti_main; + +u8 * +format_pvti_tx_peer_ptr (u8 *s, va_list *args) +{ + pvti_tx_peer_t *peer = va_arg (*args, pvti_tx_peer_t *); + + s = format ( + s, + "[%p]%s local:%U:%d remote:%U:%d underlay_mtu:%d underlay_fib_idx:%d " + "pvti_idx:%d b0_max_clen:%d cseq:%d chunk_count:%d reass_chunk_count:%d", + peer, peer->deleted ? 
" DELETED" : "", format_ip46_address, + &peer->local_ip, IP46_TYPE_ANY, peer->local_port, format_ip46_address, + &peer->remote_ip, IP46_TYPE_ANY, peer->remote_port, peer->underlay_mtu, + peer->underlay_fib_index, peer->pvti_if_index, + peer->bo0_max_current_length, peer->current_tx_seq, peer->chunk_count, + peer->reass_chunk_count); + + return (s); +} + +u8 * +format_pvti_rx_peer_ptr (u8 *s, va_list *args) +{ + pvti_rx_peer_t *peer = va_arg (*args, pvti_rx_peer_t *); + + s = format (s, "[%p]%s local:%U:%d remote:%U:%d pvti_idx:%d", peer, + peer->deleted ? " DELETED" : "", format_ip46_address, + &peer->local_ip, IP46_TYPE_ANY, peer->local_port, + format_ip46_address, &peer->remote_ip, IP46_TYPE_ANY, + peer->remote_port, peer->pvti_if_index); + + return (s); +} + +void +pvti_verify_initialized (pvti_main_t *pvm) +{ + if (!pvm->is_initialized) + { + const int n_threads = vlib_get_n_threads (); + vec_validate (pvm->per_thread_data[0], n_threads - 1); + vec_validate (pvm->per_thread_data[1], n_threads - 1); + pvm->is_initialized = 1; + } +} + +void +vnet_int_pvti_bypass_mode (u32 sw_if_index, u8 is_ip6, u8 is_enable) +{ + pvti_main_t *pvm = &pvti_main; + + if (pool_is_free_index (pvm->vnet_main->interface_main.sw_interfaces, + sw_if_index)) + return; + + pvti_verify_initialized (pvm); + + is_enable = !!is_enable; + + if (is_ip6) + { + if (clib_bitmap_get (pvm->bm_ip6_bypass_enabled_by_sw_if, sw_if_index) != + is_enable) + { + vnet_feature_enable_disable ("ip6-unicast", "ip6-pvti-bypass", + sw_if_index, is_enable, 0, 0); + pvm->bm_ip6_bypass_enabled_by_sw_if = clib_bitmap_set ( + pvm->bm_ip6_bypass_enabled_by_sw_if, sw_if_index, is_enable); + } + } + else + { + if (clib_bitmap_get (pvm->bm_ip4_bypass_enabled_by_sw_if, sw_if_index) != + is_enable) + { + vnet_feature_enable_disable ("ip4-unicast", "ip4-pvti-bypass", + sw_if_index, is_enable, 0, 0); + pvm->bm_ip4_bypass_enabled_by_sw_if = clib_bitmap_set ( + pvm->bm_ip4_bypass_enabled_by_sw_if, sw_if_index, is_enable); + } + } +} + +static clib_error_t * +set_ip_pvti_bypass (u32 is_ip6, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + clib_error_t *error = 0; + u32 sw_if_index, is_enable; + + sw_if_index = ~0; + is_enable = 1; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat_user (line_input, unformat_vnet_sw_interface, vnm, + &sw_if_index)) + ; + else if (unformat (line_input, "del")) + is_enable = 0; + else + { + error = unformat_parse_error (line_input); + goto done; + } + } + + if (~0 == sw_if_index) + { + error = clib_error_return (0, "unknown interface `%U'", + format_unformat_error, line_input); + goto done; + } + + vnet_int_pvti_bypass_mode (sw_if_index, is_ip6, is_enable); + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * +set_ip4_pvti_bypass (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + return set_ip_pvti_bypass (0, input, cmd); +} + +VLIB_CLI_COMMAND (set_interface_ip_pvti_bypass_command, static) = { + .path = "set interface ip pvti-bypass", + .function = set_ip4_pvti_bypass, + .short_help = "set interface ip pvti-bypass <interface> [del]", +}; + +static clib_error_t * +set_ip6_pvti_bypass (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + return set_ip_pvti_bypass (1, input, cmd); +} + +VLIB_CLI_COMMAND 
(set_interface_ip6_pvti_bypass_command, static) = { + .path = "set interface ip6 pvti-bypass", + .function = set_ip6_pvti_bypass, + .short_help = "set interface ip6 pvti-bypass <interface> [del]", +}; + +static clib_error_t * +pvti_interface_create_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; + + // pvti_main_t * pmp = &pvti_main; + u32 sw_if_index = ~0; + int rv = 0; + ip_address_t peer_ip = { 0 }; + ip_address_t local_ip = { 0 }; + u32 peer_port = 0; + u32 local_port = 12345; + u32 underlay_mtu = 1500; + u32 underlay_fib_index = ~0; + u32 underlay_table_id = ~0; + pvti_peer_address_method_t peer_address_method = PVTI_PEER_ADDRESS_FIXED; + bool peer_set = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "peer %U %d %d", unformat_ip_address, &peer_ip, + &peer_port, &local_port)) + { + peer_set = 1; + } + else if (unformat (line_input, "underlay-mtu %d", &underlay_mtu)) + { + // MTU set + } + else if (unformat (line_input, "local-ip %U", unformat_ip_address, + &local_ip)) + { + // local IP set + } + else if (unformat (line_input, "underlay-fib %d", &underlay_fib_index)) + { + // underlay fib set + } + else if (unformat (line_input, "peer-address-from-payload")) + { + peer_address_method = PVTI_PEER_ADDRESS_FROM_PAYLOAD; + } + else if (unformat (line_input, "underlay-table %d", &underlay_table_id)) + { + fib_protocol_t fib_proto = FIB_PROTOCOL_IP4; + if (peer_ip.version == AF_IP6) + { + fib_proto = FIB_PROTOCOL_IP6; + } + u32 fib_index = fib_table_find (fib_proto, underlay_table_id); + + if (~0 == fib_index) + { + error = clib_error_return (0, "Nonexistent table id %d", + underlay_table_id); + goto done; + } + underlay_fib_index = fib_index; + } + else + break; + } + if (!peer_set) + { + error = clib_error_return (0, "Please specify a peer..."); + goto done; + } + + rv = pvti_if_create (&local_ip, local_port, &peer_ip, peer_port, + peer_address_method, underlay_mtu, underlay_fib_index, + &sw_if_index); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_INVALID_SW_IF_INDEX: + error = clib_error_return (0, "Invalid interface"); + break; + + default: + error = clib_error_return (0, "pvti_if_create returned %d", rv); + } +done: + unformat_free (line_input); + return error; +} + +static clib_error_t * +pvti_interface_delete_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + // pvti_main_t * pmp = &pvti_main; + u32 sw_if_index = ~0; + int rv = 0; + bool if_index_set = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "if-index %d", &sw_if_index)) + { + if_index_set = 1; + } + else + break; + } + if (!if_index_set) + return clib_error_return (0, "Please specify a sw_if_index..."); + + rv = pvti_if_delete (sw_if_index); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_INVALID_SW_IF_INDEX: + return clib_error_return (0, "Invalid interface"); + break; + + default: + return clib_error_return (0, "pvti_if_delete returned %d", rv); + } + return 0; +} + +VLIB_CLI_COMMAND (pvti_interface_create_command, static) = { + .path = "pvti interface create", + .short_help = + "pvti interface create peer <remote-ip> <remote-port> <local-port> [ " + "local-ip <ip-addr> ][ underlay-mtu <MTU>][underlay-table " + 
"<table-index>][inderlay-fib <fib-index>]", + .function = pvti_interface_create_command_fn, +}; + +VLIB_CLI_COMMAND (pvti_interface_delete_command, static) = { + .path = "pvti interface delete", + .short_help = "pvti interface delete if-index <sw-ifindex>", + .function = pvti_interface_delete_command_fn, +}; + +static clib_error_t * +pvti_show_interface_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + pvti_if_t *pvti_if; + vec_foreach (pvti_if, pvti_main.if_pool) + { + int index = pvti_if - pvti_main.if_pool; + vlib_cli_output (vm, "%U", format_pvti_if, index); + }; + return 0; +} + +static clib_error_t * +pvti_show_tx_peers_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + pvti_per_thread_data_t *ptd; + int is_ip6; + for (is_ip6 = 0; is_ip6 <= 1; is_ip6++) + { + vec_foreach (ptd, pvti_main.per_thread_data[is_ip6]) + { + vlib_cli_output (vm, "thread %d (%s)", + ptd - pvti_main.per_thread_data[is_ip6], + is_ip6 ? "IPv6" : "IPv4"); + pvti_tx_peer_t *peer; + vec_foreach (peer, ptd->tx_peers) + { + vlib_cli_output (vm, " %U", format_pvti_tx_peer_ptr, peer); + } + } + } + return 0; +} + +static clib_error_t * +pvti_show_rx_peers_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + pvti_per_thread_data_t *ptd; + int is_ip6; + for (is_ip6 = 0; is_ip6 <= 1; is_ip6++) + { + vec_foreach (ptd, pvti_main.per_thread_data[is_ip6]) + { + vlib_cli_output (vm, "thread %d (%s)", + ptd - pvti_main.per_thread_data[is_ip6], + is_ip6 ? "IPv6" : "IPv4"); + pvti_rx_peer_t *peer; + vec_foreach (peer, ptd->rx_peers) + { + vlib_cli_output (vm, " %U", format_pvti_rx_peer_ptr, peer); + } + } + } + return 0; +} + +VLIB_CLI_COMMAND (pvti_show_interface_command, static) = { + .path = "show pvti interface", + .short_help = "show pvti interface", + .function = pvti_show_interface_command_fn, +}; + +VLIB_CLI_COMMAND (pvti_show_tx_peers_command, static) = { + .path = "show pvti tx peers", + .short_help = "show pvti tx peers", + .function = pvti_show_tx_peers_command_fn, +}; + +VLIB_CLI_COMMAND (pvti_show_rx_peers_command, static) = { + .path = "show pvti rx peers", + .short_help = "show pvti rx peers", + .function = pvti_show_rx_peers_command_fn, +}; + +void pvti_api_init (); + +VNET_FEATURE_INIT (pvti4_bypass, static) = { + .arc_name = "ip4-unicast", + .node_name = "ip4-pvti-bypass", + .runs_before = 0, +}; + +VNET_FEATURE_INIT (pvti6_bypass, static) = { + .arc_name = "ip6-unicast", + .node_name = "ip6-pvti-bypass", + .runs_before = 0, +}; + +static clib_error_t * +pvti_early_config (vlib_main_t *vm, unformat_input_t *input) +{ + clib_warning ("early config pvti"); + u8 *runs_before = 0; + int rbi = 0; + if (vec_len (vnet_feat_pvti4_bypass.runs_before) == 0) + { + rbi = 0; + } + else + { + rbi = vec_len (vnet_feat_pvti4_bypass.runs_before) - 1; + } + vec_validate (vnet_feat_pvti4_bypass.runs_before, rbi); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "runs-before %v", &runs_before)) + { + vec_add1 (runs_before, 0); + vnet_feat_pvti4_bypass.runs_before[rbi] = (char *) runs_before; + vec_add1 (vnet_feat_pvti4_bypass.runs_before, 0); + } + else + return clib_error_return (0, "unknown input"); + } + + return NULL; +} + +VLIB_EARLY_CONFIG_FUNCTION (pvti_early_config, "pvti"); + +static clib_error_t * +pvti_init (vlib_main_t *vm) +{ + pvti_main_t *pmp = &pvti_main; + clib_error_t *error = 0; + clib_warning ("pvti init"); + + pmp->vlib_main = vm; + pmp->vnet_main = vnet_get_main 
(); + pmp->is_initialized = 0; + + pvti_api_init (); + return error; +} + +VLIB_INIT_FUNCTION (pvti_init); + +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "Packet Vector Tunnel Interface plugin", +}; diff --git a/src/plugins/pvti/pvti.h b/src/plugins/pvti/pvti.h new file mode 100644 index 00000000000..ac097c5ecca --- /dev/null +++ b/src/plugins/pvti/pvti.h @@ -0,0 +1,257 @@ +/* + * pvti.h - skeleton vpp engine plug-in header file + * + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_pvti_h__ +#define __included_pvti_h__ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> + +#define VPP_MAX_THREADS (1 << 8) + +#define MAX_RX_STREAMS 256 + +#define PVTI_ALIGN_BYTES 9 + +typedef CLIB_PACKED (struct { + u32 seq; + u8 stream_index; // set to the cpu# on the sending side + u8 chunk_count; + u8 reass_chunk_count; // number of chunks in the front that are related to + // previously started buffer + // mandatory_flags_mask highlights which of the flags cause packet drop if + // not understood, and which of them can be just ignored. + u8 mandatory_flags_mask; + u8 flags_value; + u8 pad_bytes; + u8 pad[0]; +}) pvti_packet_header_t; + +typedef CLIB_PACKED (struct { + ip4_header_t ip4; + udp_header_t udp; + // not part of encap header pvti_packet_header_t pv; +}) pvti_ip4_encap_header_t; + +typedef CLIB_PACKED (struct { + ip6_header_t ip6; + udp_header_t udp; + // not part of encap header pvti_packet_header_t pv; +}) pvti_ip6_encap_header_t; + +typedef CLIB_PACKED (struct { + u16 total_chunk_length; + // More fragments: this chunk is not the last block fragment +#define CHUNK_FLAGS_MF (1 << 0) + // More blocks: this block has chained blocks that follow +#define CHUNK_FLAGS_MB (1 << 1) + u16 _pad0; + u32 _pad1; + u8 chunk_data[0]; +}) pvti_chunk_header_t; + +typedef struct +{ + // a buffer being built from the smaller packets + u32 bi0; + + // how big can this buffer grow + u32 bi0_max_current_length; + + // how many chunks are already in the buffer + u8 chunk_count; + // leading reassembly chunk count + u8 reass_chunk_count; + + u32 current_tx_seq; +} pvti_per_tx_stream_data_t; + +typedef struct +{ + /* The seq# that we last processed */ + u32 last_rx_seq; + + // a current buffer that is being reassembled + u32 rx_bi0; + // The root buffer, most of the times == rx_bi0 except in the case of chained + // buffers. 
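+  // (this chain head is what gets dispatched to rx_next0 once the inner
+  //  packet has been fully reassembled)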
+ u32 rx_bi0_first; + + // Next index for dispatch when the reassembly is done + u16 rx_next0; + // expected totall inner length for the packet + u16 rx_expected_inner_length; + u16 rx_received_inner_length; + +} pvti_per_rx_stream_data_t; + +typedef struct +{ + ip_address_t local_ip; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u16 underlay_mtu; + u32 underlay_fib_index; + + u32 pvti_if_index; + bool deleted; + bool is_bo0_traced; + + u32 bo0_max_current_length; + + u8 chunk_count; + u8 reass_chunk_count; + u32 current_tx_seq; + vlib_buffer_t *bo0; + +} pvti_tx_peer_t; + +typedef struct +{ + ip_address_t local_ip; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + + pvti_per_rx_stream_data_t rx_streams[MAX_RX_STREAMS]; + + u32 pvti_if_index; + bool deleted; +} pvti_rx_peer_t; + +typedef struct +{ + /* pool of destination-based structures which are used to build the packets + */ + pvti_tx_peer_t *tx_peers; + + /* vector of buffers to send */ + u32 *pending_tx_buffers; + u16 *pending_tx_nexts; + /* pool of source-based structures for the remote peers' data tracking + */ + pvti_rx_peer_t *rx_peers; + + /* vector of buffers being decapsulated */ + u32 *pending_rx_buffers; + u16 *pending_rx_nexts; + +} pvti_per_thread_data_t; + +typedef struct +{ + ip_address_t local_ip; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u16 underlay_mtu; + u32 underlay_fib_index; + bool peer_address_from_payload; + u64 created_at; + + u32 sw_if_index; + u32 hw_if_index; + + // per-stream data for TX + pvti_per_tx_stream_data_t tx_streams[256]; + pvti_per_rx_stream_data_t rx_streams[256]; + +} pvti_if_t; + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + + /* have we initialized the data structures ? */ + bool is_initialized; + + /* interface pool */ + pvti_if_t *if_pool; + + /* if_index in the pool above by sw_if_index */ + index_t *if_index_by_sw_if_index; + + /* indices by port */ + index_t **if_indices_by_port; + + /* per-thread data, ip4[0] and ip6[1] */ + pvti_per_thread_data_t *per_thread_data[2]; + + /* on/off switch for the periodic function */ + u8 periodic_timer_enabled; + /* Node index, non-zero if the periodic process has been created */ + u32 periodic_node_index; + + /* graph node state */ + uword *bm_ip4_bypass_enabled_by_sw_if; + uword *bm_ip6_bypass_enabled_by_sw_if; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ethernet_main_t *ethernet_main; +} pvti_main_t; + +extern pvti_main_t pvti_main; + +extern vlib_node_registration_t pvti_node; +extern vlib_node_registration_t pvti4_input_node; +extern vlib_node_registration_t pvti4_output_node; +extern vlib_node_registration_t pvti6_input_node; +extern vlib_node_registration_t pvti6_output_node; +extern vlib_node_registration_t pvti_periodic_node; + +always_inline u8 +pvti_get_stream_index (int is_ip6) +{ + u32 thread_index = vlib_get_thread_index (); + + ASSERT ((thread_index & 0xffffff80) == 0); + + u8 stream_index = (thread_index & 0x7f) | (is_ip6 ? 
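+  // Stream index layout: bit 7 = address family (1 for ip6), bits 0..6 =
+  // the sending thread index (the ASSERT above keeps it within 7 bits).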
0x80 : 0); + return stream_index; +} + +/* attempt to get a new buffer */ +always_inline u32 +pvti_get_new_buffer (vlib_main_t *vm) +{ + u32 bi0 = INDEX_INVALID; + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + return INDEX_INVALID; + } + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + b0->current_data = 0; + b0->current_length = 0; + return bi0; +} + +/* Periodic function events */ +#define PVTI_EVENT1 1 +#define PVTI_EVENT2 2 +#define PVTI_EVENT_PERIODIC_ENABLE_DISABLE 3 + +void pvti_create_periodic_process (pvti_main_t *); +void pvti_verify_initialized (pvti_main_t *pvm); + +#endif /* __included_pvti_h__ */ diff --git a/src/plugins/pvti/pvti_if.c b/src/plugins/pvti/pvti_if.c new file mode 100644 index 00000000000..4f83994a1a4 --- /dev/null +++ b/src/plugins/pvti/pvti_if.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Copyright (c) 2020 Doc.ai and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/adj/adj_midchain.h> +#include <vnet/udp/udp.h> + +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +static u8 * +format_pvti_if_name (u8 *s, va_list *args) +{ + u32 dev_instance = va_arg (*args, u32); + // wg_if_t *wgi = wg_if_get (dev_instance); + return format (s, "pvti%d", dev_instance); +} + +u8 * +format_pvti_if (u8 *s, va_list *args) +{ + index_t pvtii = va_arg (*args, u32); + pvti_if_t *pvti_if = pvti_if_get (pvtii); + + s = format ( + s, "[%d] %U local:%U:%d remote:%U:%d underlay_mtu:%d underlay_fib_idx:%d", + pvtii, format_vnet_sw_if_index_name, vnet_get_main (), + pvti_if->sw_if_index, format_ip46_address, &pvti_if->local_ip, + IP46_TYPE_ANY, pvti_if->local_port, format_ip46_address, + &pvti_if->remote_ip, IP46_TYPE_ANY, pvti_if->remote_port, + pvti_if->underlay_mtu, pvti_if->underlay_fib_index); + + return (s); +} + +index_t +pvti_if_find_by_sw_if_index (u32 sw_if_index) +{ + if (vec_len (pvti_main.if_index_by_sw_if_index) <= sw_if_index) + return INDEX_INVALID; + u32 ti = pvti_main.if_index_by_sw_if_index[sw_if_index]; + if (ti == ~0) + return INDEX_INVALID; + + return (ti); +} + +index_t +pvti_if_find_by_remote_ip4_and_port (ip4_address_t *remote_ip4, + u16 remote_port) +{ + pvti_if_t *ifc; + pool_foreach (ifc, pvti_main.if_pool) + { + if ((ifc->remote_port == remote_port) && + (ifc->remote_ip.version == AF_IP4) && + ((ifc->remote_ip.ip.ip4.as_u32 == remote_ip4->as_u32) || + ifc->peer_address_from_payload)) + { + return (ifc - pvti_main.if_pool); + } + } + return INDEX_INVALID; +} + +index_t +pvti_if_find_by_remote_ip6_and_port (ip6_address_t *remote_ip6, + u16 remote_port) +{ + pvti_if_t *ifc; + pool_foreach (ifc, pvti_main.if_pool) + { + if ((ifc->remote_port == remote_port) && + (ifc->remote_ip.version == AF_IP6) && + ((0 == memcmp (&ifc->remote_ip.ip.ip6, remote_ip6, + sizeof (*remote_ip6))) || + ifc->peer_address_from_payload)) + { + return (ifc - pvti_main.if_pool); + } + } + return INDEX_INVALID; +} + +index_t +pvti_if_find_by_remote_ip_and_port (ip_address_t *remote_ip, u16 remote_port) +{ + 
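+  /* Linear scan of the interface pool: the remote port must match, and the
+   * tunnel either carries peer-address-from-payload (so any remote address
+   * is accepted) or its configured remote address matches exactly - the
+   * same rule as the ip4/ip6 specific helpers above. */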
pvti_if_t *ifc; + pool_foreach (ifc, pvti_main.if_pool) + { + if ((ifc->remote_port == remote_port) && + (ifc->peer_address_from_payload || + (0 == ip_address_cmp (remote_ip, &ifc->remote_ip)))) + { + return (ifc - pvti_main.if_pool); + } + } + return INDEX_INVALID; +} + +static void +pvti_add_tidx_by_port (index_t t_index, u16 port) +{ + pvti_main_t *pvm = &pvti_main; + vec_validate_init_empty (pvm->if_indices_by_port, port, NULL); + vec_add1 (pvm->if_indices_by_port[port], t_index); +} + +static void +pvti_del_tidx_by_port (index_t t_index, u16 port) +{ + pvti_main_t *pvm = &pvti_main; + index_t *ii; + if (!pvm->if_indices_by_port) + { + return; + } + if (port >= vec_len (pvm->if_indices_by_port)) + { + return; + } + if (vec_len (pvm->if_indices_by_port[port]) == 0) + { + ALWAYS_ASSERT (pvm->if_indices_by_port[port] > 0); + /* not reached */ + return; + } + + vec_foreach (ii, pvm->if_indices_by_port[port]) + { + if (*ii == t_index) + { + vec_del1 (pvm->if_indices_by_port[port], + pvm->if_indices_by_port[port] - ii); + break; + } + } +} + +static u32 +pvti_get_tunnel_count_by_port (u16 port) +{ + pvti_main_t *pvm = &pvti_main; + if (!pvm->if_indices_by_port) + { + return 0; + } + return vec_len (vec_elt (pvm->if_indices_by_port, port)); +} + +static clib_error_t * +pvti_if_admin_up_down (vnet_main_t *vnm, u32 hw_if_index, u32 flags) +{ + // vnet_hw_interface_t *hi; + u32 hw_flags; + + // hi = vnet_get_hw_interface (vnm, hw_if_index); + hw_flags = + (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP ? VNET_HW_INTERFACE_FLAG_LINK_UP : + 0); + vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags); + + return (NULL); +} + +void +pvti_if_update_adj (vnet_main_t *vnm, u32 sw_if_index, adj_index_t ai) +{ + + /* Convert any neighbour adjacency that has a next-hop reachable through + * the wg interface into a midchain. This is to avoid sending ARP/ND to + * resolve the next-hop address via the wg interface. Then, if one of the + * peers has matching prefix among allowed prefixes, the midchain will be + * updated to the corresponding one. 
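+ * For pvti the midchain rewrite is left empty (NULL fixup and rewrite in
+ * the call below); the actual encapsulation is done by the pvti output
+ * node, which is installed as the interface's l3 output node at create
+ * time.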
+ */ + adj_nbr_midchain_update_rewrite (ai, NULL, NULL, ADJ_FLAG_NONE, NULL); + + // wgii = wg_if_find_by_sw_if_index (sw_if_index); + // wg_if_peer_walk (wg_if_get (wgii), wg_peer_if_adj_change, &ai); +} + +VNET_DEVICE_CLASS (pvti_if_device_class) = { + .name = "Packet Vectorizer Tunnel", + .format_device_name = format_pvti_if_name, + .admin_up_down_function = pvti_if_admin_up_down, +}; + +VNET_HW_INTERFACE_CLASS (pvti_hw_interface_class) = { + .name = "PVTunnel", + .update_adjacency = pvti_if_update_adj, + .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P, + // .flags = VNET_HW_INTERFACE_CLASS_FLAG_NBMA, +}; + +int +pvti_if_create (ip_address_t *local_ip, u16 local_port, + ip_address_t *remote_ip, u16 remote_port, + pvti_peer_address_method_t peer_address_method, + u16 underlay_mtu, u32 underlay_fib_index, u32 *sw_if_indexp) +{ + vnet_main_t *vnm = vnet_get_main (); + pvti_main_t *pvm = &pvti_main; + u32 hw_if_index; + vnet_hw_interface_t *hi; + pvti_verify_initialized (pvm); + + pvti_if_t *pvti_if; + + ASSERT (sw_if_indexp); + + *sw_if_indexp = (u32) ~0; + + pool_get_zero (pvti_main.if_pool, pvti_if); + pvti_if->local_ip = *local_ip; + pvti_if->local_port = local_port; + pvti_if->remote_ip = *remote_ip; + if (peer_address_method == PVTI_PEER_ADDRESS_FROM_PAYLOAD) + { + pvti_if->peer_address_from_payload = 1; + } + pvti_if->remote_port = remote_port; + pvti_if->underlay_mtu = underlay_mtu; + pvti_if->underlay_fib_index = underlay_fib_index; + pvti_if->created_at = clib_cpu_time_now (); + + /* tunnel index (or instance) */ + u32 t_idx = pvti_if - pvti_main.if_pool; + + hw_if_index = + vnet_register_interface (vnm, pvti_if_device_class.index, t_idx, + pvti_hw_interface_class.index, t_idx); + + pvti_if->hw_if_index = hw_if_index; + + hi = vnet_get_hw_interface (vnm, hw_if_index); + pvti_if->sw_if_index = *sw_if_indexp = hi->sw_if_index; + + vec_validate_init_empty (pvm->if_index_by_sw_if_index, hi->sw_if_index, + INDEX_INVALID); + + vec_elt (pvm->if_index_by_sw_if_index, hi->sw_if_index) = t_idx; + pvti_if_t *pvti_if0 = pool_elt_at_index (pvti_main.if_pool, t_idx); + int i; + for (i = 0; i < 256; i++) + { + pvti_if0->tx_streams[i].bi0 = INDEX_INVALID; + pvti_if0->tx_streams[i].current_tx_seq = 42; + + pvti_if0->rx_streams[i].rx_bi0 = INDEX_INVALID; + pvti_if0->rx_streams[i].rx_bi0_first = INDEX_INVALID; + } + + /* + int is_ip6 = 0; + u32 encap_index = !is_ip6 ? 
+ pvti4_output_node.index : pvti6_output_node.index; + vnet_set_interface_output_node (vnm, pvti_if->hw_if_index, encap_index); + */ + vnet_set_interface_l3_output_node (vnm->vlib_main, hi->sw_if_index, + (u8 *) "pvti4-output"); + + pvti_add_tidx_by_port (t_idx, local_port); + if (1 == pvti_get_tunnel_count_by_port (local_port)) + { + clib_warning ("Registering local port %d", local_port); + udp_register_dst_port (vlib_get_main (), local_port, + pvti4_input_node.index, UDP_IP4); + udp_register_dst_port (vlib_get_main (), local_port, + pvti6_input_node.index, UDP_IP6); + } + else + { + clib_warning ("Not registering the port"); + } + + vnet_hw_interface_set_flags (vnm, pvti_if->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + + return 0; +} + +void +pvti_if_walk (pvti_if_walk_cb_t fn, void *data) +{ + index_t pvtii; + + pool_foreach_index (pvtii, pvti_main.if_pool) + { + if (WALK_STOP == fn (pvtii, data)) + break; + } +} + +int +pvti_if_delete (u32 sw_if_index) +{ + vnet_main_t *vnm = vnet_get_main (); + pvti_main_t *pvm = &pvti_main; + + if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index)) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + vnet_hw_interface_t *hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + if (hw == 0 || hw->dev_class_index != pvti_if_device_class.index) + return VNET_API_ERROR_INVALID_VALUE; + + pvti_if_t *ifc; + bool found = 0; + pool_foreach (ifc, pvm->if_pool) + { + if (ifc->sw_if_index == sw_if_index) + { + found = 1; + break; + } + } + if (!found) + { + return VNET_API_ERROR_INVALID_VALUE_2; + } + index_t tidx = ifc - pvm->if_pool; + + u16 local_port = ifc->local_port; + pvti_del_tidx_by_port (tidx, local_port); + pvm->if_index_by_sw_if_index[sw_if_index] = INDEX_INVALID; + + if (0 == pvti_get_tunnel_count_by_port (local_port)) + { + udp_unregister_dst_port (vlib_get_main (), local_port, 1); + udp_unregister_dst_port (vlib_get_main (), local_port, 0); + } + + vnet_reset_interface_l3_output_node (vnm->vlib_main, sw_if_index); + vnet_delete_hw_interface (vnm, hw->hw_if_index); + pool_put (pvti_main.if_pool, ifc); + + /* mark per-thread peers as deleted */ + pvti_per_thread_data_t *ptd; + + vec_foreach (ptd, pvm->per_thread_data[0]) + { + pvti_tx_peer_t *peer; + vec_foreach (peer, ptd->tx_peers) + { + if (tidx == peer->pvti_if_index) + { + peer->deleted = 1; + } + } + } + vec_foreach (ptd, pvm->per_thread_data[1]) + { + pvti_tx_peer_t *peer; + vec_foreach (peer, ptd->tx_peers) + { + if (tidx == peer->pvti_if_index) + { + peer->deleted = 1; + } + } + } + + return 0; +} diff --git a/src/plugins/pvti/pvti_if.h b/src/plugins/pvti/pvti_if.h new file mode 100644 index 00000000000..44bf22ce825 --- /dev/null +++ b/src/plugins/pvti/pvti_if.h @@ -0,0 +1,47 @@ +#ifndef PVTI_IF_H +#define PVTI_IF_H + +#include <vnet/interface_funcs.h> + +typedef enum +{ + PVTI_PEER_ADDRESS_FIXED = 0, + PVTI_PEER_ADDRESS_FROM_PAYLOAD +} pvti_peer_address_method_t; + +typedef walk_rc_t (*pvti_if_walk_cb_t) (index_t wgi, void *data); +void pvti_if_walk (pvti_if_walk_cb_t fn, void *data); + +int pvti_if_create (ip_address_t *local_ip, u16 local_port, + ip_address_t *remote_ip, u16 remote_port, + pvti_peer_address_method_t peer_address_method, + u16 underlay_mtu, u32 underlay_fib_index, + u32 *sw_if_indexp); +index_t pvti_if_find_by_sw_if_index (u32 sw_if_index); +index_t pvti_if_find_by_remote_ip4_and_port (ip4_address_t *remote_ip4, + u16 remote_port); +index_t pvti_if_find_by_remote_ip6_and_port (ip6_address_t *remote_ip4, + u16 remote_port); + +index_t 
pvti_if_find_by_remote_ip_and_port (ip_address_t *remote_ip,
+					    u16 remote_port);
+
+int pvti_if_delete (u32 sw_if_index);
+
+u8 *format_pvti_if (u8 *s, va_list *args);
+
+static_always_inline pvti_if_t *
+pvti_if_get (index_t pvtii)
+{
+  if (INDEX_INVALID == pvtii)
+    return (NULL);
+  return (pool_elt_at_index (pvti_main.if_pool, pvtii));
+}
+
+static_always_inline index_t
+pvti_if_get_index (pvti_if_t *pvti_if)
+{
+  return pvti_if - pvti_main.if_pool;
+}
+
+#endif
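+
+/*
+ * Example CLI session (illustrative values only; the peer address, ports
+ * and interface name are placeholders, not taken from this patch):
+ *
+ *   pvti interface create peer 192.0.2.1 4567 4567 underlay-mtu 1500
+ *   set interface ip pvti-bypass GigabitEthernet0/0/0
+ *   show pvti interface
+ *   show pvti tx peers
+ *   show pvti rx peers
+ *   pvti interface delete if-index <sw-ifindex>
+ */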