diff options
author | Andrew Yourtchenko <ayourtch@gmail.com> | 2024-01-23 11:57:51 +0000 |
---|---|---|
committer | Damjan Marion <dmarion@0xa5.net> | 2024-09-17 12:10:19 +0000 |
commit | 0acb398d6d13b03514bcda72a7b52ece28932b1f (patch) | |
tree | 727b8c8d9a1dc6758d6714b7ea1291b924aa61a6 /src/plugins/pvti | |
parent | 6ccfc3991de2e5e1c5cd12147ecdae39eee9d675 (diff) |
pvti: Packet Vector Tunnel Interface
This plugin implements a PoC of UDP-based tunnel substrate whose aim is
to specifically provide higher MTU to the upper layers by chunking
the payload PDUs into smaller packets with full 5-tuple.
At the same time, if there are multiple small packets to
the same destination during the vector processing, they
are packed into "carrier" packets up to underlay MTU size.
It does assume a trustworthy underlying medium, thus for the
operation over Internet it requires the use of encryption layer
underneath.
Type: feature
Change-Id: I323958fa8de62584f6ed15643ea689568a9a62bc
Signed-off-by: Andrew Yourtchenko <ayourtch@gmail.com>
Diffstat (limited to 'src/plugins/pvti')
-rw-r--r-- | src/plugins/pvti/CMakeLists.txt | 40 | ||||
-rw-r--r-- | src/plugins/pvti/FEATURE.yaml | 8 | ||||
-rw-r--r-- | src/plugins/pvti/api.c | 137 | ||||
-rw-r--r-- | src/plugins/pvti/bypass-main.c | 79 | ||||
-rw-r--r-- | src/plugins/pvti/bypass.c | 202 | ||||
-rw-r--r-- | src/plugins/pvti/bypass.h | 53 | ||||
-rw-r--r-- | src/plugins/pvti/input-main.c | 115 | ||||
-rw-r--r-- | src/plugins/pvti/input.c | 496 | ||||
-rw-r--r-- | src/plugins/pvti/input.h | 87 | ||||
-rw-r--r-- | src/plugins/pvti/output-main.c | 85 | ||||
-rw-r--r-- | src/plugins/pvti/output.c | 543 | ||||
-rw-r--r-- | src/plugins/pvti/output.h | 75 | ||||
-rw-r--r-- | src/plugins/pvti/pvti.api | 111 | ||||
-rw-r--r-- | src/plugins/pvti/pvti.c | 481 | ||||
-rw-r--r-- | src/plugins/pvti/pvti.h | 257 | ||||
-rw-r--r-- | src/plugins/pvti/pvti_if.c | 376 | ||||
-rw-r--r-- | src/plugins/pvti/pvti_if.h | 47 |
17 files changed, 3192 insertions, 0 deletions
diff --git a/src/plugins/pvti/CMakeLists.txt b/src/plugins/pvti/CMakeLists.txt new file mode 100644 index 00000000000..900b662d54a --- /dev/null +++ b/src/plugins/pvti/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright (c) 2024 Cisco and/or its affiliates. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_vpp_plugin(pvti + SOURCES + pvti_if.c + pvti.c + input.h + input.c + input-main.c + output.h + output.c + output-main.c + bypass.h + bypass.c + bypass-main.c + api.c + pvti.h + + MULTIARCH_SOURCES + input.c + output.c + bypass.c + + API_FILES + pvti.api + + # API_TEST_SOURCES + # pvti_test.c +) diff --git a/src/plugins/pvti/FEATURE.yaml b/src/plugins/pvti/FEATURE.yaml new file mode 100644 index 00000000000..52dbe5b7c1b --- /dev/null +++ b/src/plugins/pvti/FEATURE.yaml @@ -0,0 +1,8 @@ +--- +name: Packet Vector Tunnel +maintainer: Andrew Yourtchenko <ayourtch@gmail.com> +features: + - support inner MTU up to ~8K over standard 1280..1500 MTU substrate +description: "Large MTU Tunnels" +state: development +properties: [API, CLI] diff --git a/src/plugins/pvti/api.c b/src/plugins/pvti/api.c new file mode 100644 index 00000000000..cda39ad44e8 --- /dev/null +++ b/src/plugins/pvti/api.c @@ -0,0 +1,137 @@ + +#include <vnet/vnet.h> +#include <vlibmemory/api.h> + +#include <vnet/format_fns.h> +#include <vnet/ip/ip_types_api.h> +#include <vlibapi/api.h> + +#include <pvti/pvti.api_enum.h> +#include <pvti/pvti.api_types.h> + +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +#define REPLY_MSG_ID_BASE pvm->msg_id_base +#include <vlibapi/api_helper_macros.h> + +typedef struct +{ + vl_api_registration_t *reg; + u32 context; +} pvti_if_details_ctx_t; + +typedef struct +{ + +} pvti_interface_dump_ctx_t; + +static walk_rc_t +pvti_if_send_details (index_t pvtii, void *data) +{ + vl_api_pvti_interface_details_t *rmp; + pvti_if_details_ctx_t *ctx = data; + const pvti_if_t *pvi; + + pvi = pvti_if_get (pvtii); + + rmp = vl_msg_api_alloc_zero (sizeof (*rmp)); + rmp->_vl_msg_id = + htons (VL_API_PVTI_INTERFACE_DETAILS + pvti_main.msg_id_base); + + rmp->interface.sw_if_index = htonl (pvi->sw_if_index); + rmp->interface.local_port = htons (pvi->local_port); + rmp->interface.remote_port = htons (pvi->remote_port); + rmp->interface.underlay_mtu = htons (pvi->underlay_mtu); + + ip_address_encode2 (&pvi->local_ip, &rmp->interface.local_ip); + ip_address_encode2 (&pvi->remote_ip, &rmp->interface.remote_ip); + + rmp->context = ctx->context; + + vl_api_send_msg (ctx->reg, (u8 *) rmp); + + return (WALK_CONTINUE); +} + +static void +vl_api_pvti_interface_dump_t_handler (vl_api_pvti_interface_dump_t *mp) +{ + vl_api_registration_t *reg; + // pvti_main_t *pvm = &pvti_main; + + reg = vl_api_client_index_to_registration (mp->client_index); + if (reg == 0) + return; + + pvti_if_details_ctx_t ctx = { + .reg = reg, + .context = mp->context, + }; + + u32 sw_if_index = ntohl (mp->sw_if_index); + if (sw_if_index == ~0) + pvti_if_walk (pvti_if_send_details, &ctx); + else + { + index_t pvtii = pvti_if_find_by_sw_if_index (sw_if_index); + if (pvtii != INDEX_INVALID) + pvti_if_send_details (pvtii, &ctx); + } +} + +static void +vl_api_pvti_interface_create_t_handler (vl_api_pvti_interface_create_t *mp) +{ + vl_api_pvti_interface_create_reply_t *rmp; + pvti_main_t *pvm = &pvti_main; + int rv = ~0; + u32 sw_if_index = ~0; + ip_address_t local_ip; + ip_address_t remote_ip; + + ip_address_decode2 (&mp->interface.local_ip, &local_ip); + ip_address_decode2 (&mp->interface.remote_ip, &remote_ip); + u16 lport = clib_host_to_net_u16 (mp->interface.local_port); + u16 rport = clib_host_to_net_u16 (mp->interface.remote_port); + u16 underlay_mtu = clib_host_to_net_u16 (mp->interface.underlay_mtu); + u32 underlay_fib_index = + clib_host_to_net_u32 (mp->interface.underlay_fib_index); + pvti_peer_address_method_t peer_address_method = + mp->interface.peer_address_from_payload ? PVTI_PEER_ADDRESS_FROM_PAYLOAD : + PVTI_PEER_ADDRESS_FIXED; + + if (underlay_mtu == 0) + { + underlay_mtu = 1500; + } + + rv = + pvti_if_create (&local_ip, lport, &remote_ip, rport, peer_address_method, + underlay_mtu, underlay_fib_index, &sw_if_index); + + REPLY_MACRO2 (VL_API_PVTI_INTERFACE_CREATE_REPLY, + { rmp->sw_if_index = htonl (sw_if_index); }); +} + +static void +vl_api_pvti_interface_delete_t_handler (vl_api_pvti_interface_delete_t *mp) +{ + vl_api_pvti_interface_delete_reply_t *rmp; + pvti_main_t *pvm = &pvti_main; + int rv = 0; + + rv = pvti_if_delete (ntohl (mp->sw_if_index)); + REPLY_MACRO (VL_API_PVTI_INTERFACE_DELETE_REPLY); +} + +/* API definitions */ +#include <pvti/pvti.api.c> + +void +pvti_api_init () +{ + pvti_main_t *pvm = &pvti_main; + /* Add our API messages to the global name_crc hash table */ + pvm->msg_id_base = setup_message_id_table (); +} diff --git a/src/plugins/pvti/bypass-main.c b/src/plugins/pvti/bypass-main.c new file mode 100644 index 00000000000..db79ccd2113 --- /dev/null +++ b/src/plugins/pvti/bypass-main.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <pvti/bypass.h> + +/* packet trace format function */ +static u8 * +format_pvti_bypass_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + pvti_bypass_trace_t *t = va_arg (*args, pvti_bypass_trace_t *); + + s = format (s, "PVTI-BYPASS: sw_if_index %d, next index %d\n", + t->sw_if_index, t->next_index); + s = format (s, " src %U sport %d dport %d\n", format_ip_address, + &t->remote_ip, t->remote_port, t->local_port); + s = format (s, " seq: %d", t->seq); + return s; +} + +vlib_node_registration_t pvti4_bypass_node; +vlib_node_registration_t pvti6_bypass_node; + +static char *pvti_bypass_error_strings[] = { +#define _(sym, string) string, + foreach_pvti_bypass_error +#undef _ +}; + +VLIB_REGISTER_NODE (pvti4_bypass_node) = +{ + .name = "ip4-pvti-bypass", + .vector_size = sizeof (u32), + .format_trace = format_pvti_bypass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_bypass_error_strings), + .error_strings = pvti_bypass_error_strings, + + .n_next_nodes = PVTI_BYPASS_N_NEXT, + + .next_nodes = { + [PVTI_BYPASS_NEXT_DROP] = "error-drop", + [PVTI_BYPASS_NEXT_PVTI_INPUT] = "pvti4-input", + }, + +}; + +VLIB_REGISTER_NODE (pvti6_bypass_node) = +{ + .name = "ip6-pvti-bypass", + .vector_size = sizeof (u32), + .format_trace = format_pvti_bypass_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_bypass_error_strings), + .error_strings = pvti_bypass_error_strings, + + .n_next_nodes = PVTI_BYPASS_N_NEXT, + + .next_nodes = { + [PVTI_BYPASS_NEXT_DROP] = "error-drop", + [PVTI_BYPASS_NEXT_PVTI_INPUT] = "pvti6-input", + }, + +}; diff --git a/src/plugins/pvti/bypass.c b/src/plugins/pvti/bypass.c new file mode 100644 index 00000000000..14c976439eb --- /dev/null +++ b/src/plugins/pvti/bypass.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> +#include <pvti/bypass.h> + +always_inline u16 +pvti_bypass_node_common (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, bool is_ip6) +{ + u32 n_left_from, *from, *to_next; + pvti_bypass_next_t next_index; + vlib_node_runtime_t *error_node = + vlib_node_get_runtime (vm, ip4_input_node.index); + + u32 pkts_processed = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t *b0; + u32 sw_if_index0 = 0; + ip4_header_t *ip40; + ip6_header_t *ip60; + udp_header_t *udp0; + u32 bi0, ip_len0, udp_len0, flags0, next0; + u8 error0, good_udp0, proto0; + i32 len_diff0; + + bi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + /* setup the packet for the next feature */ + vnet_feature_next (&next0, b0); + + if (is_ip6) + { + ip60 = vlib_buffer_get_current (b0); + } + else + { + ip40 = vlib_buffer_get_current (b0); + } + + if (is_ip6) + { + proto0 = ip60->protocol; + } + else + { + /* Treat IP frag packets as "experimental" protocol for now */ + proto0 = ip4_is_fragment (ip40) ? 0xfe : ip40->protocol; + } + + /* Process packet 0 */ + if (proto0 != IP_PROTOCOL_UDP) + goto exit; /* not UDP packet */ + + if (is_ip6) + udp0 = ip6_next_header (ip60); + else + udp0 = ip4_next_header (ip40); + + /* look up the destination ip and port */ + u32 pvti_index0 = INDEX_INVALID; + if (is_ip6) + { + pvti_index0 = pvti_if_find_by_remote_ip6_and_port ( + &ip60->src_address, clib_net_to_host_u16 (udp0->src_port)); + } + else + { + pvti_index0 = pvti_if_find_by_remote_ip4_and_port ( + &ip40->src_address, clib_net_to_host_u16 (udp0->src_port)); + } + if (pvti_index0 == INDEX_INVALID) + goto exit; + + flags0 = b0->flags; + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + + /* Don't verify UDP checksum for packets with explicit zero checksum. + */ + good_udp0 |= udp0->checksum == 0; + + /* Verify UDP length */ + if (is_ip6) + ip_len0 = clib_net_to_host_u16 (ip60->payload_length); + else + ip_len0 = clib_net_to_host_u16 (ip40->length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + len_diff0 = ip_len0 - udp_len0; + + /* Verify UDP checksum */ + if (PREDICT_FALSE (!good_udp0)) + { + if (is_ip6) + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, b0); + else + flags0 = ip4_tcp_udp_validate_checksum (vm, b0); + good_udp0 = (flags0 & VNET_BUFFER_F_L4_CHECKSUM_CORRECT) != 0; + } + + if (is_ip6) + { + error0 = good_udp0 ? 0 : IP6_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP6_ERROR_UDP_LENGTH; + } + else + { + error0 = good_udp0 ? 0 : IP4_ERROR_UDP_CHECKSUM; + error0 = (len_diff0 >= 0) ? error0 : IP4_ERROR_UDP_LENGTH; + } + + next0 = error0 ? PVTI_BYPASS_NEXT_DROP : PVTI_BYPASS_NEXT_PVTI_INPUT; + b0->error = error0 ? error_node->errors[error0] : 0; + + /* pvtiX-input node expect current at PVTI header */ + if (is_ip6) + vlib_buffer_advance (b0, sizeof (ip6_header_t) + + sizeof (udp_header_t)); + else + vlib_buffer_advance (b0, sizeof (ip4_header_t) + + sizeof (udp_header_t)); + exit: + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_bypass_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + t->seq = 0; // clib_net_to_host_u32 (pvti0->seq); + if (is_ip6) + { + } + else + { + t->remote_ip.ip.ip4 = ip40->src_address; + t->remote_ip.version = AF_IP4; + } + // t->local_port = h0->udp.dst_port; + // t->remote_port = h0->udp.src_port; + } + + pkts_processed += 1; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, to_next, + n_left_to_next, bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, node->node_index, + PVTI_BYPASS_ERROR_PROCESSED, pkts_processed); + return frame->n_vectors; +} + +VLIB_NODE_FN (pvti4_bypass_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_bypass_node_common (vm, node, frame, 0); +} + +VLIB_NODE_FN (pvti6_bypass_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_bypass_node_common (vm, node, frame, 1); +} diff --git a/src/plugins/pvti/bypass.h b/src/plugins/pvti/bypass.h new file mode 100644 index 00000000000..611d5770ad3 --- /dev/null +++ b/src/plugins/pvti/bypass.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_pvti_bypass_h__ +#define __included_pvti_bypass_h__ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +typedef struct +{ + u32 next_index; + u32 sw_if_index; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u32 seq; +} pvti_bypass_trace_t; + +#define foreach_pvti_bypass_error \ + _ (PROCESSED, "PVTI bypass tunnel packets processed") + +typedef enum +{ +#define _(sym, str) PVTI_BYPASS_ERROR_##sym, + foreach_pvti_bypass_error +#undef _ + PVTI_BYPASS_N_ERROR, +} pvti_bypass_error_t; + +typedef enum +{ + PVTI_BYPASS_NEXT_DROP, + PVTI_BYPASS_NEXT_PVTI_INPUT, + PVTI_BYPASS_N_NEXT, +} pvti_bypass_next_t; + +#endif // pvti_bypass_h diff --git a/src/plugins/pvti/input-main.c b/src/plugins/pvti/input-main.c new file mode 100644 index 00000000000..8ab8b18dd7c --- /dev/null +++ b/src/plugins/pvti/input-main.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <pvti/input.h> + +static char *pvti_input_error_strings[] = { +#define _(sym, string) string, + foreach_pvti_input_error +#undef _ +}; + +#define _(f, s) s, +static char *pvti_input_trace_type_names[] = { foreach_pvti_input_trace_type }; +#undef _ + +static char * +get_pvti_trace_type_name (u8 ptype) +{ + if (ptype < PVTI_INPUT_TRACE_N_TYPES) + { + return pvti_input_trace_type_names[ptype]; + } + else + { + return "unknown"; + } +} + +/* packet trace format function */ +static u8 * +format_pvti_input_trace (u8 *s, va_list *args) +{ + int i; + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + pvti_input_trace_t *t = va_arg (*args, pvti_input_trace_t *); + + u32 indent = format_get_indent (s); + + s = format (s, + "PVTI-IN: sw_if_index %d, next index %d, trace_type: %s(%d), " + "chunkcnt: %d\n", + t->sw_if_index, t->next_index, + get_pvti_trace_type_name (t->trace_type), t->trace_type, + t->chunk_count); + s = format (s, " src %U sport %d dport %d\n", format_ip_address, + &t->remote_ip, t->remote_port, t->local_port); + s = format (s, " seq: %d, chunk_count: %d\n", t->seq, t->chunk_count); + u16 max = t->chunk_count > MAX_CHUNKS ? MAX_CHUNKS : t->chunk_count; + for (i = 0; i < max; i++) + { + s = format (s, " %02d: sz %d\n", i, t->chunks[i].total_chunk_length); + } + s = format (s, "\n%U%U", format_white_space, indent, + format_ip_adjacency_packet_data, t->packet_data, + sizeof (t->packet_data)); + + return s; +} + +vlib_node_registration_t pvti4_input_node; +vlib_node_registration_t pvti6_input_node; + +VLIB_REGISTER_NODE (pvti4_input_node) = +{ + .name = "pvti4-input", + .vector_size = sizeof (u32), + .format_trace = format_pvti_input_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_input_error_strings), + .error_strings = pvti_input_error_strings, + + .n_next_nodes = PVTI_INPUT_N_NEXT, + + .next_nodes = { + [PVTI_INPUT_NEXT_DROP] = "error-drop", + [PVTI_INPUT_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [PVTI_INPUT_NEXT_IP6_INPUT] = "ip6-input", + [PVTI_INPUT_NEXT_PUNT] = "error-punt", + }, + +}; +VLIB_REGISTER_NODE (pvti6_input_node) = +{ + .name = "pvti6-input", + .vector_size = sizeof (u32), + .format_trace = format_pvti_input_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(pvti_input_error_strings), + .error_strings = pvti_input_error_strings, + + .n_next_nodes = PVTI_INPUT_N_NEXT, + + .next_nodes = { + [PVTI_INPUT_NEXT_DROP] = "error-drop", + [PVTI_INPUT_NEXT_IP4_INPUT] = "ip4-input-no-checksum", + [PVTI_INPUT_NEXT_IP6_INPUT] = "ip6-input", + [PVTI_INPUT_NEXT_PUNT] = "error-punt", + }, + +}; diff --git a/src/plugins/pvti/input.c b/src/plugins/pvti/input.c new file mode 100644 index 00000000000..6a8806e2795 --- /dev/null +++ b/src/plugins/pvti/input.c @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> +#include <pvti/input.h> + +always_inline void +pvti_enqueue_rx_bi_to_next_and_trace (vlib_main_t *vm, + vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, u32 bi0, + u16 next0) +{ + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + + if (PREDICT_TRUE (vlib_trace_buffer (vm, node, next0, b0, + /* follow_chain */ 0))) + { + pvti_input_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + t->trace_type = PVTI_INPUT_TRACE_decap; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + vec_add1 (ptd->pending_rx_buffers, bi0); + vec_add1 (ptd->pending_rx_nexts, next0); +} + +always_inline pvti_rx_peer_t * +pvti_try_find_or_create_rx_peer (pvti_per_thread_data_t *ptd, + vlib_buffer_t *b0, bool is_ip6) +{ + pvti_rx_peer_t *peer; + + ip_address_t remote_ip = { 0 }; + u16 remote_port; + if (is_ip6) + { + pvti_ip6_encap_header_t *h0 = + ((pvti_ip6_encap_header_t *) vlib_buffer_get_current (b0)) - 1; + ip_address_set (&remote_ip, &h0->ip6.src_address, AF_IP6); + remote_port = clib_net_to_host_u16 (h0->udp.src_port); + } + else + { + pvti_ip4_encap_header_t *h0 = + ((pvti_ip4_encap_header_t *) vlib_buffer_get_current (b0)) - 1; + ip_address_set (&remote_ip, &h0->ip4.src_address, AF_IP4); + remote_port = clib_net_to_host_u16 (h0->udp.src_port); + } + + pool_foreach (peer, ptd->rx_peers) + { + if (peer->remote_port == remote_port && + 0 == ip_address_cmp (&remote_ip, &peer->remote_ip)) + { + if (peer->deleted) + { + // The peer has been marked as deleted - wipe it. + clib_memset (peer, 0xca, sizeof (*peer)); + pool_put (ptd->rx_peers, peer); + continue; + } + return peer; + } + } + + index_t pvti_if_index0 = + pvti_if_find_by_remote_ip_and_port (&remote_ip, remote_port); + if (INDEX_INVALID == pvti_if_index0) + { + // no suitable interface found, bail + return 0; + } + pvti_if_t *pvti_if0 = pvti_if_get (pvti_if_index0); + + pvti_rx_peer_t new_peer = { + .local_ip = pvti_if0->local_ip, + .local_port = pvti_if0->local_port, + .remote_ip = remote_ip, + .remote_port = remote_port, + .pvti_if_index = pvti_if_index0, + .rx_streams = { { 0 } }, + }; + pvti_rx_peer_t *rx_new_peer; + pool_get (ptd->rx_peers, rx_new_peer); + *rx_new_peer = new_peer; + + int i; + for (i = 0; i < MAX_RX_STREAMS; i++) + { + rx_new_peer->rx_streams[i].rx_bi0 = INDEX_INVALID; + rx_new_peer->rx_streams[i].rx_bi0_first = INDEX_INVALID; + rx_new_peer->rx_streams[i].rx_next0 = 0; + } + + return rx_new_peer; +} + +always_inline u16 +pvti_input_node_common (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, bool is_ip6) +{ + u32 n_left_from, *from; + pvti_chunk_header_t *chunks[MAX_CHUNKS]; + u32 pkts_processed = 0; + u32 pkts_decapsulated = 0; + u32 decap_failed_no_buffers = 0; + + pvti_main_t *pvm = &pvti_main; + + u32 thread_index = vlib_get_thread_index (); + pvti_per_thread_data_t *ptd = + vec_elt_at_index (pvm->per_thread_data[is_ip6], thread_index); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + while (n_left_from > 0) + { + u32 bi0; + vlib_buffer_t *b0; + u32 next0 = PVTI_INPUT_NEXT_DROP; + u32 sw_if_index0; + u8 true_chunk_count = 0; + u8 max_chunk_count; + + bi0 = from[0]; + from += 1; + n_left_from -= 1; + + b0 = vlib_get_buffer (vm, bi0); + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + pvti_ip4_encap_header_t *h0 = + ((pvti_ip4_encap_header_t *) vlib_buffer_get_current (b0)) - 1; + pvti_rx_peer_t *pvti_rx_peer0 = + pvti_try_find_or_create_rx_peer (ptd, b0, is_ip6); + if (!pvti_rx_peer0) + { + b0->error = node->errors[PVTI_INPUT_ERROR_PEER]; + goto drop_and_maybe_trace; + } + + b0 = vlib_get_buffer (vm, bi0); + pvti_packet_header_t *pvti0 = vlib_buffer_get_current (b0); + u8 stream_index = pvti0->stream_index; + max_chunk_count = + pvti0->chunk_count < MAX_CHUNKS ? pvti0->chunk_count : MAX_CHUNKS; + u16 pvti_packet_header_sz0 = + pvti0->pad_bytes + offsetof (pvti_packet_header_t, pad); + if (b0->current_length < pvti_packet_header_sz0) + { + b0->error = node->errors[PVTI_INPUT_ERROR_PACKET_TOO_SHORT]; + goto drop_and_maybe_trace; + } + vlib_buffer_advance (b0, pvti_packet_header_sz0); + + if (max_chunk_count == 0) + { + b0->error = node->errors[PVTI_INPUT_ERROR_NOCHUNKS]; + goto drop_and_maybe_trace; + } + if (pvti0->reass_chunk_count > max_chunk_count) + { + b0->error = node->errors[PVTI_INPUT_ERROR_TOOMANYREASS]; + goto drop_and_maybe_trace; + } + pvti_per_rx_stream_data_t *rx_stream0 = + &pvti_rx_peer0->rx_streams[stream_index]; + + u32 new_seq0 = clib_net_to_host_u32 (pvti0->seq); + if (new_seq0 == rx_stream0->last_rx_seq + 1) + { + /* Sequence# matches, we can attempt adding the leading chunks to + * reassembly */ + rx_stream0->last_rx_seq = new_seq0; + + while ((b0->current_length > 0) && + true_chunk_count < pvti0->reass_chunk_count) + { + /* attempt to either incorporate the first chunk into + * reassembly or skip it. */ + pvti_chunk_header_t *pvc0 = vlib_buffer_get_current (b0); + const u16 chunk_payload_length = + clib_net_to_host_u16 (pvc0->total_chunk_length) - + sizeof (*pvc0); + vlib_buffer_advance (b0, sizeof (*pvc0)); + + if (rx_stream0->rx_bi0 == INDEX_INVALID) + { + clib_warning ( + "RX internal error: not-first chunk but no wip block"); + } + else + { + + vlib_buffer_t *rb0 = + vlib_get_buffer (vm, rx_stream0->rx_bi0); + u16 allowed_length = + PVTI_RX_MAX_LENGTH - rb0->current_length; + if (allowed_length > chunk_payload_length) + { + // simple case - there is space in the buffer to fit + // the whole chunk + void *tail = + vlib_buffer_put_uninit (rb0, chunk_payload_length); + clib_memcpy (tail, vlib_buffer_get_current (b0), + chunk_payload_length); + } + else + { + // The current chunk can not fit - need to make two + // copies, one into the current buffer, and one into + // a newly allocated chained buffer. + void *tail = + vlib_buffer_put_uninit (rb0, allowed_length); + clib_memcpy (tail, vlib_buffer_get_current (b0), + allowed_length); + u16 remaining_payload_length = + chunk_payload_length - allowed_length; + u32 nrbi0 = pvti_get_new_buffer (vm); + if (INDEX_INVALID == nrbi0) + { + ASSERT (0); // FIXME what the recovery is + // supposed to look like ? + } + else + { + // link up the new buffer and copy the remainder + // there + vlib_buffer_t *nrb0 = vlib_get_buffer (vm, nrbi0); + rb0->flags |= VLIB_BUFFER_NEXT_PRESENT; + rb0->flags &= ~VLIB_BUFFER_TOTAL_LENGTH_VALID; + rb0->next_buffer = nrbi0; + rx_stream0->rx_bi0 = nrbi0; + void *tail = vlib_buffer_put_uninit ( + nrb0, remaining_payload_length); + clib_memcpy (tail, + vlib_buffer_get_current (b0) + + allowed_length, + remaining_payload_length); + } + } + pvti_rx_peer0->rx_streams[stream_index] + .rx_received_inner_length += chunk_payload_length; + if (pvti_rx_peer0->rx_streams[stream_index] + .rx_received_inner_length == + pvti_rx_peer0->rx_streams[stream_index] + .rx_expected_inner_length) + { + next0 = rx_stream0->rx_next0; + pvti_enqueue_rx_bi_to_next_and_trace ( + vm, node, ptd, rx_stream0->rx_bi0_first, next0); + pkts_decapsulated += 1; + + // clean out the current reassemly state + rx_stream0->rx_bi0 = INDEX_INVALID; + rx_stream0->rx_bi0_first = INDEX_INVALID; + pvti_rx_peer0->rx_streams[stream_index] + .rx_received_inner_length = 0; + pvti_rx_peer0->rx_streams[stream_index] + .rx_expected_inner_length = 0; + rx_stream0->rx_next0 = 0; + } + } + chunks[true_chunk_count] = pvc0; + true_chunk_count += 1; + vlib_buffer_advance (b0, chunk_payload_length); + } + } + else + { + /* Sequence does not match, skip the reassembly chunks and reset + * the reassembly state */ + + while ((b0->current_length > 0) && + true_chunk_count < pvti0->reass_chunk_count) + { + /* skip the reassembly chunks */ + pvti_chunk_header_t *pvc0 = vlib_buffer_get_current (b0); + chunks[true_chunk_count] = pvc0; + true_chunk_count += 1; + vlib_buffer_advance ( + b0, clib_net_to_host_u16 (pvc0->total_chunk_length)); + } + // FIXME: discard the current reassembly state, reset the seq# + if (rx_stream0->rx_bi0_first != INDEX_INVALID) + { + clib_warning ("RX PVTI: discard chunk being reassembled"); + vlib_buffer_free_one (vm, rx_stream0->rx_bi0_first); + rx_stream0->rx_bi0 = INDEX_INVALID; + rx_stream0->rx_bi0_first = INDEX_INVALID; + rx_stream0->rx_received_inner_length = 0; + rx_stream0->rx_expected_inner_length = 0; + rx_stream0->rx_next0 = 0; + } + } + + while ((b0->current_length > 0) && true_chunk_count < max_chunk_count) + { + if (b0->current_length < sizeof (pvti_chunk_header_t)) + { + clib_warning ("RX ERR: length too short for a chunk"); + break; + } + pvti_chunk_header_t *pvc0 = vlib_buffer_get_current (b0); + chunks[true_chunk_count] = pvc0; + true_chunk_count += 1; + u16 total_chunk_length = + clib_net_to_host_u16 (pvc0->total_chunk_length); + if (b0->current_length < total_chunk_length) + { + clib_warning ("RX ERR: length 0x%x too big for a chunk", + true_chunk_count); + break; + } + u8 *pkt = (u8 *) (pvc0 + 1); + u16 inner_length; + if (rx_stream0->rx_bi0_first != INDEX_INVALID) + { + vlib_buffer_free_one (vm, rx_stream0->rx_bi0_first); + rx_stream0->rx_bi0 = INDEX_INVALID; + rx_stream0->rx_bi0_first = INDEX_INVALID; + rx_stream0->rx_received_inner_length = 0; + rx_stream0->rx_expected_inner_length = 0; + rx_stream0->rx_next0 = 0; + } + + switch (*pkt & 0xf0) + { + case 0x40: + next0 = PVTI_INPUT_NEXT_IP4_INPUT; + inner_length = clib_net_to_host_u16 (*((u16 *) (pkt + 2))); + break; + case 0x60: + next0 = PVTI_INPUT_NEXT_IP6_INPUT; + inner_length = clib_net_to_host_u16 (*((u16 *) (pkt + 4))) + + sizeof (ip6_header_t); + break; + default: + next0 = PVTI_INPUT_NEXT_DROP; + vlib_buffer_advance (b0, total_chunk_length); + continue; + } + vlib_buffer_advance (b0, sizeof (pvti_chunk_header_t)); + + if (inner_length + sizeof (pvti_chunk_header_t) > total_chunk_length) + { + /* FIXME: the packet size is larger than the chunk -> it's a + * first fragment */ + // enqueue the chunk and finish packet processing. + // There must be no active reassembly. + ASSERT (rx_stream0->rx_bi0_first == INDEX_INVALID); + rx_stream0->rx_next0 = next0; + rx_stream0->rx_bi0 = bi0; + rx_stream0->rx_bi0_first = bi0; + rx_stream0->rx_expected_inner_length = inner_length; + rx_stream0->rx_received_inner_length = + total_chunk_length - sizeof (pvti_chunk_header_t); + rx_stream0->last_rx_seq = new_seq0; + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_input_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = ~0; + t->trace_type = PVTI_INPUT_TRACE_enqueue; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + goto continue_outer; + } + + u32 nbi0 = pvti_get_new_buffer (vm); + if (INDEX_INVALID == nbi0) + { + decap_failed_no_buffers += 1; + continue; + }; + vlib_buffer_t *nb0 = vlib_get_buffer (vm, nbi0); + pvti_if_t *pvti_if0 = pvti_if_get (pvti_rx_peer0->pvti_if_index); + vnet_buffer (nb0)->sw_if_index[VLIB_RX] = pvti_if0->sw_if_index; + void *new_packet = vlib_buffer_put_uninit (nb0, inner_length); + clib_memcpy (new_packet, pvc0 + 1, inner_length); + vlib_buffer_advance (b0, inner_length); + + pvti_enqueue_rx_bi_to_next_and_trace (vm, node, ptd, nbi0, next0); + pkts_decapsulated += 1; + } + /* we have processed all the chunks from the buffer, but the buffer + * remains. Free it. */ + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_input_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = ~0; + t->trace_type = PVTI_INPUT_TRACE_free; + t->seq = clib_net_to_host_u32 (pvti0->seq); + t->chunk_count = pvti0->chunk_count; + u8 chunk_count = + pvti0->chunk_count < MAX_CHUNKS ? pvti0->chunk_count : MAX_CHUNKS; + for (int i = 0; i < chunk_count; i++) + { + t->chunks[i].total_chunk_length = + clib_net_to_host_u16 (chunks[i]->total_chunk_length); + } + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + vlib_buffer_free_one (vm, bi0); + + continue_outer: + pkts_processed += 1; + continue; + + drop_and_maybe_trace: + next0 = PVTI_INPUT_NEXT_DROP; + + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + int i; + pvti_input_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->trace_type = PVTI_INPUT_TRACE_drop; + t->next_index = next0; + t->remote_ip.ip.ip4 = h0->ip4.src_address; + t->remote_ip.version = AF_IP4; + t->local_port = h0->udp.dst_port; + t->remote_port = h0->udp.src_port; + if (!pvti_rx_peer0) + { + t->seq = 0xdeaddead; + } + else + { + t->seq = clib_net_to_host_u32 (pvti0->seq); + t->chunk_count = pvti0->chunk_count; + u8 chunk_count = pvti0->chunk_count < MAX_CHUNKS ? + pvti0->chunk_count : + MAX_CHUNKS; + for (i = 0; i < chunk_count; i++) + { + t->chunks[i].total_chunk_length = + clib_net_to_host_u16 (chunks[i]->total_chunk_length); + } + } + } + + pkts_processed += 1; + vec_add1 (ptd->pending_rx_buffers, bi0); + vec_add1 (ptd->pending_rx_nexts, next0); + } + + vlib_buffer_enqueue_to_next_vec (vm, node, &ptd->pending_rx_buffers, + &ptd->pending_rx_nexts, + vec_len (ptd->pending_rx_nexts)); + vec_reset_length (ptd->pending_rx_buffers); + vec_reset_length (ptd->pending_rx_nexts); + + vlib_node_increment_counter (vm, node->node_index, + PVTI_INPUT_ERROR_PROCESSED, pkts_processed); + vlib_node_increment_counter ( + vm, node->node_index, PVTI_INPUT_ERROR_DECAPSULATED, pkts_decapsulated); + vlib_node_increment_counter (vm, node->node_index, + PVTI_INPUT_ERROR_NO_BUFFERS, + decap_failed_no_buffers); + return frame->n_vectors; +} + +VLIB_NODE_FN (pvti4_input_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_input_node_common (vm, node, frame, 0); +} + +VLIB_NODE_FN (pvti6_input_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_input_node_common (vm, node, frame, 1); +} diff --git a/src/plugins/pvti/input.h b/src/plugins/pvti/input.h new file mode 100644 index 00000000000..02a186cde05 --- /dev/null +++ b/src/plugins/pvti/input.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_pvti_input_h__ +#define __included_pvti_input_h__ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +typedef struct +{ + u16 total_chunk_length; +} pvti_input_chunk_t; + +#define MAX_CHUNKS 32 +#define PVTI_RX_MAX_LENGTH 2048 + +typedef struct +{ + u32 next_index; + u32 sw_if_index; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u32 seq; + pvti_input_chunk_t chunks[MAX_CHUNKS]; + u8 chunk_count; + u8 trace_type; + u8 packet_data[64]; +} pvti_input_trace_t; + +#define foreach_pvti_input_trace_type \ + _ (drop, "drop") \ + _ (decap, "decapsulate") \ + _ (free, "free") \ + _ (enqueue, "enqueue") + +typedef enum +{ +#define _(f, s) PVTI_INPUT_TRACE_##f, + foreach_pvti_input_trace_type +#undef _ + PVTI_INPUT_TRACE_N_TYPES, +} pvti_input_trace_type_t; + +#define foreach_pvti_input_error \ + _ (PROCESSED, "PVTI tunneled packets processed") \ + _ (DECAPSULATED, "PVTI inner packets decapsulated") \ + _ (PEER, "Could not find a peer") \ + _ (NOCHUNKS, "Packet has no chunks") \ + _ (NO_BUFFERS, "No buffers available to decapsulate") \ + _ (TOOMANYREASS, "Packet has more reassembly chunks than total") \ + _ (PACKET_TOO_SHORT, "Packet too short") + +typedef enum +{ +#define _(sym, str) PVTI_INPUT_ERROR_##sym, + foreach_pvti_input_error +#undef _ + PVTI_INPUT_N_ERROR, +} pvti_input_error_t; + +typedef enum +{ + PVTI_INPUT_NEXT_DROP, + PVTI_INPUT_NEXT_IP4_INPUT, + PVTI_INPUT_NEXT_IP6_INPUT, + PVTI_INPUT_NEXT_PUNT, + PVTI_INPUT_N_NEXT, +} pvti_input_next_t; + +#endif // pvti_input_h diff --git a/src/plugins/pvti/output-main.c b/src/plugins/pvti/output-main.c new file mode 100644 index 00000000000..ae4ae5f8e98 --- /dev/null +++ b/src/plugins/pvti/output-main.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <pvti/output.h> + +/* packet trace format function */ +static u8 * +format_pvti_output_trace (u8 *s, va_list *args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + pvti_output_trace_t *t = va_arg (*args, pvti_output_trace_t *); + + u32 indent = format_get_indent (s); + s = + format (s, "PVTI-OUT(%d): sw_if_index %d, next index %d, underlay_mtu %d,", + t->trace_type, t->sw_if_index, t->next_index, t->underlay_mtu); + s = format (s, "\n%U stream_index %d, bi0_max_current_length %d, tx_seq %d", + format_white_space, indent, t->stream_index, + t->bi0_max_current_length, t->tx_seq); + s = format (s, "\n%U%U", format_white_space, indent, + format_ip_adjacency_packet_data, t->packet_data, + sizeof (t->packet_data)); + + return s; +} + +vlib_node_registration_t pvti_output_node; + +static char *pvti_output_error_strings[] = { +#define _(sym, string) string, + foreach_pvti_output_error +#undef _ +}; + +VLIB_REGISTER_NODE (pvti4_output_node) = +{ + .name = "pvti4-output", + .vector_size = sizeof (u32), + .format_trace = format_pvti_output_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(pvti_output_error_strings), + .error_strings = pvti_output_error_strings, + + .n_next_nodes = PVTI_OUTPUT_N_NEXT, + + .next_nodes = { + [PVTI_OUTPUT_NEXT_DROP] = "error-drop", + [PVTI_OUTPUT_NEXT_INTERFACE_OUTPUT] = "adj-midchain-tx", + [PVTI_OUTPUT_NEXT_IP4_LOOKUP] = "ip4-lookup", + [PVTI_OUTPUT_NEXT_IP6_LOOKUP] = "ip6-lookup", + }, + +}; +VLIB_REGISTER_NODE (pvti6_output_node) = +{ + .name = "pvti6-output", + .vector_size = sizeof (u32), + .format_trace = format_pvti_output_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + .n_errors = ARRAY_LEN(pvti_output_error_strings), + .error_strings = pvti_output_error_strings, + + .n_next_nodes = PVTI_OUTPUT_N_NEXT, + + .next_nodes = { + [PVTI_OUTPUT_NEXT_DROP] = "error-drop", + [PVTI_OUTPUT_NEXT_INTERFACE_OUTPUT] = "adj-midchain-tx", + [PVTI_OUTPUT_NEXT_IP4_LOOKUP] = "ip4-lookup", + [PVTI_OUTPUT_NEXT_IP6_LOOKUP] = "ip6-lookup", + }, + +}; diff --git a/src/plugins/pvti/output.c b/src/plugins/pvti/output.c new file mode 100644 index 00000000000..1939c6f585a --- /dev/null +++ b/src/plugins/pvti/output.c @@ -0,0 +1,543 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> +#include <pvti/output.h> + +static_always_inline u32 +ip6_vtcfl (u8 stream_index) +{ + u32 vtcfl = 0x6 << 28; + vtcfl |= stream_index; + + return (clib_host_to_net_u32 (vtcfl)); +} + +always_inline vlib_buffer_t * +pvti_alloc_new_tx_buffer (vlib_main_t *vm) +{ + u32 bi0 = INDEX_INVALID; + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + return 0; + } + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + b0->current_data = 0; + b0->current_length = 0; + return b0; +} + +always_inline bool +pvti_find_or_try_create_tx_peer (vlib_main_t *vm, pvti_per_thread_data_t *ptd, + pvti_if_t *pvti_if0, ip_address_t *remote_ip, + u16 remote_port, u32 *out_index) +{ + + pvti_tx_peer_t *peer; + pool_foreach (peer, ptd->tx_peers) + { + if (peer->remote_port == remote_port && + 0 == ip_address_cmp (remote_ip, &peer->remote_ip)) + { + if (peer->deleted) + { + // Bad luck, the peer has been deleted. + u32 boi0 = vlib_get_buffer_index (vm, peer->bo0); + if (peer->bo0) + { + vlib_buffer_free (vm, &boi0, 1); + } + clib_memset (peer, 0xca, sizeof (*peer)); + pool_put (ptd->tx_peers, peer); + continue; + } + *out_index = peer - ptd->tx_peers; + return 1; + } + } + + ip_address_family_t dst_ver = ip_addr_version (&pvti_if0->remote_ip); + + u16 pvti_encap_overhead = (dst_ver == AF_IP6) ? + sizeof (pvti_ip6_encap_header_t) : + sizeof (pvti_ip4_encap_header_t); + + u16 pvti_packet_overhead = + pvti_encap_overhead + sizeof (pvti_packet_header_t) + PVTI_ALIGN_BYTES; + + ASSERT (pvti_if0->underlay_mtu > pvti_packet_overhead); + + u32 bo0_max_current_length = pvti_if0->underlay_mtu - pvti_packet_overhead; + + vlib_buffer_t *bo0 = pvti_alloc_new_tx_buffer (vm); + + if (!bo0) + { + return 0; + } + + pvti_tx_peer_t new_peer = { + .local_ip = pvti_if0->local_ip, + .remote_ip = *remote_ip, + .local_port = pvti_if0->local_port, + .remote_port = remote_port, + .underlay_mtu = pvti_if0->underlay_mtu, + .underlay_fib_index = pvti_if0->underlay_fib_index, + .bo0_max_current_length = bo0_max_current_length, + .pvti_if_index = pvti_if_get_index (pvti_if0), + .deleted = 0, + .bo0 = bo0, + .chunk_count = 0, + .reass_chunk_count = 0, + .current_tx_seq = 42, + }; + + pvti_tx_peer_t *tx_new_peer; + pool_get (ptd->tx_peers, tx_new_peer); + + *tx_new_peer = new_peer; + *out_index = tx_new_peer - ptd->tx_peers; + return 1; +} + +always_inline bool +pvti_try_get_tx_peer_index (vlib_main_t *vm, pvti_per_thread_data_t *ptd, + pvti_if_t *pvti_if0, vlib_buffer_t *b0, + bool is_ip6, u32 *out_index) +{ + if (pvti_if0->peer_address_from_payload) + { + ip_address_t remote_ip = { 0 }; + if (is_ip6) + { + ip6_header_t *ip6 = vlib_buffer_get_current (b0); + ip_address_set (&remote_ip, &ip6->dst_address, AF_IP6); + } + else + { + ip4_header_t *ip4 = vlib_buffer_get_current (b0); + ip_address_set (&remote_ip, &ip4->dst_address, AF_IP4); + } + return pvti_find_or_try_create_tx_peer ( + vm, ptd, pvti_if0, &remote_ip, pvti_if0->remote_port, out_index); + } + else + { + return pvti_find_or_try_create_tx_peer ( + vm, ptd, pvti_if0, &pvti_if0->remote_ip, pvti_if0->remote_port, + out_index); + } + /* not reached */ +} + +always_inline void +pvti_finalize_chunk (pvti_tx_peer_t *tx_peer, + pvti_chunk_header_t *chunk_header, u8 *tail, + bool is_reassembly_chunk) +{ + clib_memset (chunk_header, 0xab, sizeof (pvti_chunk_header_t)); + chunk_header->total_chunk_length = + clib_host_to_net_u16 (tail - (u8 *) chunk_header); + tx_peer->chunk_count++; + if (is_reassembly_chunk) + { + tx_peer->reass_chunk_count++; + } +} + +always_inline pvti_output_next_t +encap_pvti_buffer_ip46 (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_tx_peer_t *tx_peer, int is_ip6) +{ + ip_address_family_t src_ver = ip_addr_version (&tx_peer->local_ip); + ip_address_family_t dst_ver = ip_addr_version (&tx_peer->remote_ip); + u8 stream_index = 0; + + ASSERT (src_ver == dst_ver); + bool is_ip6_encap = (AF_IP6 == src_ver); + + vlib_buffer_t *b0 = tx_peer->bo0; + vlib_buffer_advance (b0, + -(sizeof (pvti_packet_header_t) + PVTI_ALIGN_BYTES)); + + pvti_packet_header_t *pvti0 = vlib_buffer_get_current (b0); + clib_memset (pvti0, 0xca, sizeof (*pvti0) + PVTI_ALIGN_BYTES); + pvti0->pad_bytes = PVTI_ALIGN_BYTES; + + pvti0->seq = clib_host_to_net_u32 (tx_peer->current_tx_seq); + pvti0->stream_index = stream_index; + pvti0->reass_chunk_count = tx_peer->reass_chunk_count; + pvti0->chunk_count = tx_peer->chunk_count; + pvti0->mandatory_flags_mask = 0; + pvti0->flags_value = 0; + + if (is_ip6_encap) + { + vlib_buffer_advance (b0, -(sizeof (pvti_ip6_encap_header_t))); + if (b0->current_data < -VLIB_BUFFER_PRE_DATA_SIZE) + { + // undo the change + vlib_buffer_advance (b0, (sizeof (pvti_ip6_encap_header_t))); + b0->error = node->errors[PVTI_OUTPUT_ERROR_NO_PRE_SPACE]; + return PVTI_OUTPUT_NEXT_DROP; + } + pvti_ip6_encap_header_t *ve = vlib_buffer_get_current (b0); + + ve->udp.src_port = clib_host_to_net_u16 (tx_peer->local_port); + ve->udp.dst_port = clib_host_to_net_u16 (tx_peer->remote_port); + ve->udp.length = clib_host_to_net_u16 ( + b0->current_length - offsetof (pvti_ip6_encap_header_t, udp)); + ve->udp.checksum = 0; + + ve->ip6.ip_version_traffic_class_and_flow_label = + ip6_vtcfl (stream_index); + ve->ip6.payload_length = ve->udp.length; + ve->ip6.protocol = 17; + ve->ip6.hop_limit = 128; + ip_address_copy_addr (&ve->ip6.src_address, &tx_peer->local_ip); + ip_address_copy_addr (&ve->ip6.dst_address, &tx_peer->remote_ip); + } + else + { + vlib_buffer_advance (b0, -(sizeof (pvti_ip4_encap_header_t))); + if (b0->current_data < -VLIB_BUFFER_PRE_DATA_SIZE) + { + // undo the change + vlib_buffer_advance (b0, (sizeof (pvti_ip4_encap_header_t))); + b0->error = node->errors[PVTI_OUTPUT_ERROR_NO_PRE_SPACE]; + return PVTI_OUTPUT_NEXT_DROP; + } + pvti_ip4_encap_header_t *ve = vlib_buffer_get_current (b0); + + ve->udp.src_port = clib_host_to_net_u16 (tx_peer->local_port); + ve->udp.dst_port = clib_host_to_net_u16 (tx_peer->remote_port); + ve->udp.length = clib_host_to_net_u16 ( + b0->current_length - offsetof (pvti_ip4_encap_header_t, udp)); + ve->udp.checksum = 0; + + ve->ip4.ip_version_and_header_length = 0x45; + ve->ip4.tos = 0; + ve->ip4.length = clib_host_to_net_u16 (b0->current_length); + ve->ip4.fragment_id = + clib_host_to_net_u16 (tx_peer->current_tx_seq & 0xffff); + ve->ip4.flags_and_fragment_offset = 0; + ve->ip4.ttl = 128; + ve->ip4.protocol = 17; + + ve->ip4.dst_address.as_u32 = ip_addr_v4 (&tx_peer->remote_ip).data_u32; + ve->ip4.src_address.as_u32 = ip_addr_v4 (&tx_peer->local_ip).data_u32; + ve->ip4.checksum = ip4_header_checksum (&ve->ip4); + } + + // This is important, if not reset, causes a crash + vnet_buffer (b0)->sw_if_index[VLIB_TX] = tx_peer->underlay_fib_index; + + // vnet_buffer (b0)->oflags |= VNET_BUFFER_OFFLOAD_F_IP_CKSUM; + return is_ip6_encap ? PVTI_OUTPUT_NEXT_IP6_LOOKUP : + PVTI_OUTPUT_NEXT_IP4_LOOKUP; +} + +always_inline void +pvti_enqueue_tx_and_trace (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, vlib_buffer_t *b0, + u16 next0, u8 stream_index, pvti_tx_peer_t *tx_peer) +{ + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + tx_peer->is_bo0_traced)) + { + if (PREDICT_TRUE ( + vlib_trace_buffer (vm, node, next0, b0, /* follow_chain */ 0))) + { + + pvti_output_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + t->underlay_mtu = tx_peer->underlay_mtu; + t->stream_index = stream_index; + t->trace_type = 1; + t->bi0_max_current_length = tx_peer->bo0_max_current_length; + t->tx_seq = tx_peer->current_tx_seq; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + } + u32 bi0 = vlib_get_buffer_index (vm, b0); + vec_add1 (ptd->pending_tx_buffers, bi0); + vec_add1 (ptd->pending_tx_nexts, next0); +} + +always_inline void +pvti_enqueue_tx_drop_and_trace (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, vlib_buffer_t *b0, + u8 stream_index) +{ + if (PREDICT_FALSE ((node->flags & VLIB_NODE_FLAG_TRACE) && + (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + pvti_output_trace_t *t = vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = PVTI_OUTPUT_NEXT_DROP; + t->stream_index = stream_index; + t->trace_type = 0; + clib_memcpy (t->packet_data, vlib_buffer_get_current (b0), + sizeof (t->packet_data)); + } + u32 bi0 = vlib_get_buffer_index (vm, b0); + vec_add1 (ptd->pending_tx_buffers, bi0); + vec_add1 (ptd->pending_tx_nexts, PVTI_OUTPUT_NEXT_DROP); +} + +always_inline bool +pvti_flush_peer_and_recharge (vlib_main_t *vm, vlib_node_runtime_t *node, + pvti_per_thread_data_t *ptd, u32 tx_peer_index, + u8 stream_index, const bool is_ip6) +{ + pvti_tx_peer_t *tx_peer = pool_elt_at_index (ptd->tx_peers, tx_peer_index); + u16 next0 = encap_pvti_buffer_ip46 (vm, node, tx_peer, is_ip6); + + pvti_enqueue_tx_and_trace (vm, node, ptd, tx_peer->bo0, next0, stream_index, + tx_peer); + + tx_peer->bo0 = pvti_alloc_new_tx_buffer (vm); + tx_peer->reass_chunk_count = 0; + tx_peer->chunk_count = 0; + tx_peer->current_tx_seq++; + + return 1; +} + +always_inline u16 +pvti_output_node_common (vlib_main_t *vm, vlib_node_runtime_t *node, + vlib_frame_t *frame, const bool is_ip6) +{ + pvti_main_t *pvm = &pvti_main; + + u32 n_left_from, *from; + u32 pkts_encapsulated = 0; + u32 pkts_processed = 0; + u32 pkts_chopped = 0; + u32 pkts_overflow = 0; + u32 pkts_overflow_cantfit = 0; + + bool is_node_traced = (node->flags & VLIB_NODE_FLAG_TRACE) ? 1 : 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + u8 stream_index = pvti_get_stream_index (is_ip6); + + u32 thread_index = vlib_get_thread_index (); + pvti_per_thread_data_t *ptd = + vec_elt_at_index (pvm->per_thread_data[is_ip6], thread_index); + + vlib_buffer_t *ibufs[VLIB_FRAME_SIZE], **ib = ibufs; + + vlib_get_buffers (vm, from, ibufs, n_left_from); + + n_left_from = frame->n_vectors; + while (1 && n_left_from > 0) + { + n_left_from -= 1; + vlib_buffer_t *b0 = ib[0]; + ib++; + u32 bi0 = vlib_get_buffer_index (vm, b0); + bool is_b0_traced = + is_node_traced && ((b0->flags & VLIB_BUFFER_IS_TRACED) ? 1 : 0); + pkts_processed += 1; + + u32 sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_TX]; + u32 pvti_index0 = pvti_if_find_by_sw_if_index (sw_if_index0); + if (pvti_index0 == INDEX_INVALID) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_PEER]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, stream_index); + continue; + } + pvti_if_t *pvti_if0 = pvti_if_get (pvti_index0); + u32 tx_peer_index; + if (!pvti_try_get_tx_peer_index (vm, ptd, pvti_if0, b0, is_ip6, + &tx_peer_index)) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_MAKE_PEER]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, stream_index); + continue; + } + pvti_tx_peer_t *tx_peer = &ptd->tx_peers[tx_peer_index]; + + u32 b0_len = vlib_buffer_length_in_chain (vm, b0); + u32 total_chunk_len = sizeof (pvti_chunk_header_t) + b0_len; + + if (tx_peer->bo0_max_current_length >= + tx_peer->bo0->current_length + total_chunk_len) + { + /* Happy case, we can fit the entire new chunk */ + pvti_chunk_header_t *chunk_header = vlib_buffer_put_uninit ( + tx_peer->bo0, sizeof (pvti_chunk_header_t)); + u8 *tail = vlib_buffer_put_uninit (tx_peer->bo0, b0_len); + vlib_buffer_t *b0_curr; + b0_curr = b0; + while (b0_len > 0) + { + clib_memcpy (tail, vlib_buffer_get_current (b0_curr), + b0_curr->current_length); + tail += b0_curr->current_length; + b0_len -= b0_curr->current_length; + ASSERT ((b0_len == 0) || + (b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT)); + if (b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b0_curr = vlib_get_buffer (vm, b0_curr->next_buffer); + } + } + tx_peer->is_bo0_traced |= is_b0_traced; + pvti_finalize_chunk (tx_peer, chunk_header, tail, false); + } + else + { + bool is_reassembly = false; + /* FIXME: here, flush a packet if we want to avoid fragmenting it */ +#define PVTI_TINY_PACKET_SZ 20 + int threshold_len = + sizeof (pvti_chunk_header_t) + PVTI_TINY_PACKET_SZ; + + /* Can we fit anything meaningful into bo0 ? if not - flush */ + if (tx_peer->bo0_max_current_length <= + tx_peer->bo0->current_length + threshold_len) + { + if (!pvti_flush_peer_and_recharge (vm, node, ptd, tx_peer_index, + stream_index, is_ip6)) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_RECHARGE0]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, + stream_index); + continue; + } + pkts_encapsulated += 1; + } + + pvti_chunk_header_t *chunk_header = vlib_buffer_put_uninit ( + tx_peer->bo0, sizeof (pvti_chunk_header_t)); + + u8 *tail; + vlib_buffer_t *b0_curr; + /* append the chained buffers and flush as necessary */ + b0_curr = b0; + + int curr_b0_start_offset = 0; + + while (b0_len > 0) + { + ASSERT (tx_peer->bo0_max_current_length > + tx_peer->bo0->current_length); + int copy_len = + clib_min (b0_curr->current_length - curr_b0_start_offset, + tx_peer->bo0_max_current_length - + tx_peer->bo0->current_length); + tail = vlib_buffer_put_uninit (tx_peer->bo0, copy_len); + clib_memcpy (tail, + (u8 *) vlib_buffer_get_current (b0_curr) + + curr_b0_start_offset, + copy_len); + tail += copy_len; + b0_len -= copy_len; + // Advance the start offset or reset it if we copied the entire + // block + curr_b0_start_offset = + curr_b0_start_offset + copy_len == b0_curr->current_length ? + 0 : + curr_b0_start_offset + copy_len; + ASSERT ((b0_len == 0) || (curr_b0_start_offset > 0) || + (b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT)); + if (curr_b0_start_offset > 0) + { + pvti_finalize_chunk (tx_peer, chunk_header, tail, + is_reassembly); + tx_peer->is_bo0_traced |= is_b0_traced; + if (!pvti_flush_peer_and_recharge ( + vm, node, ptd, tx_peer_index, stream_index, is_ip6)) + { + b0->error = node->errors[PVTI_OUTPUT_ERROR_RECHARGE1]; + pvti_enqueue_tx_drop_and_trace (vm, node, ptd, b0, + stream_index); + continue; + } + pkts_encapsulated += 1; + /* next chunk(s) will be reassembly until the next block */ + is_reassembly = true; + chunk_header = vlib_buffer_put_uninit ( + tx_peer->bo0, sizeof (pvti_chunk_header_t)); + } + else + { + if ((b0_curr->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + b0_curr = vlib_get_buffer (vm, b0_curr->next_buffer); + } + else + { + pvti_finalize_chunk (tx_peer, chunk_header, tail, + is_reassembly); + tx_peer->is_bo0_traced |= is_b0_traced; + } + } + } + } + vlib_buffer_free_one (vm, bi0); + } + + int i; + for (i = 0; i < vec_len (ptd->tx_peers); i++) + { + if (ptd->tx_peers[i].chunk_count) + { + pvti_flush_peer_and_recharge (vm, node, ptd, i, stream_index, + is_ip6); + pkts_encapsulated += 1; + } + } + + vlib_buffer_enqueue_to_next_vec (vm, node, &ptd->pending_tx_buffers, + &ptd->pending_tx_nexts, + vec_len (ptd->pending_tx_nexts)); + vec_reset_length (ptd->pending_tx_buffers); + vec_reset_length (ptd->pending_tx_nexts); + + vlib_node_increment_counter ( + vm, node->node_index, PVTI_OUTPUT_ERROR_ENCAPSULATED, pkts_encapsulated); + vlib_node_increment_counter (vm, node->node_index, + PVTI_OUTPUT_ERROR_PROCESSED, pkts_processed); + vlib_node_increment_counter (vm, node->node_index, PVTI_OUTPUT_ERROR_CHOPPED, + pkts_chopped); + vlib_node_increment_counter (vm, node->node_index, + PVTI_OUTPUT_ERROR_OVERFLOW, pkts_overflow); + vlib_node_increment_counter (vm, node->node_index, + PVTI_OUTPUT_ERROR_OVERFLOW_CANTFIT, + pkts_overflow_cantfit); + return frame->n_vectors; +} + +VLIB_NODE_FN (pvti4_output_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_output_node_common (vm, node, frame, 0); +} + +VLIB_NODE_FN (pvti6_output_node) +(vlib_main_t *vm, vlib_node_runtime_t *node, vlib_frame_t *frame) +{ + return pvti_output_node_common (vm, node, frame, 1); +} diff --git a/src/plugins/pvti/output.h b/src/plugins/pvti/output.h new file mode 100644 index 00000000000..95e78ba9720 --- /dev/null +++ b/src/plugins/pvti/output.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_pvti_output_h__ +#define __included_pvti_output_h__ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +typedef struct +{ + u32 next_index; + u32 sw_if_index; + u32 tx_seq; + u16 underlay_mtu; + u16 bi0_max_current_length; + u8 stream_index; + u8 trace_type; + u8 packet_data[96]; +} pvti_output_trace_t; + +#define foreach_pvti_output_error \ + _ (NONE, "No error") \ + _ (PROCESSED, "Packets processed") \ + _ (ENCAPSULATED, "Packets encapsulated") \ + _ (PEER, "No peer found") \ + _ (MAKE_PEER, "Could not make peer") \ + _ (RECHARGE0, "Could not recharge 0") \ + _ (RECHARGE1, "Could not recharge 1") \ + _ (NO_PRE_SPACE, "Not enought pre-data space") \ + _ (CHOPPED, "Packets chopped") \ + _ (OVERFLOW, "Packets overflowed") \ + _ (OVERFLOW_CANTFIT, "Packets overflowed and cant fit excess") + +typedef enum +{ +#define _(sym, str) PVTI_OUTPUT_ERROR_##sym, + foreach_pvti_output_error +#undef _ + PVTI_OUTPUT_N_ERROR, +} pvti_output_error_t; + +typedef enum +{ + PVTI_INDEPENDENT_CHUNK = 0, + PVTI_REASS_CHUNK, +} pvti_chunk_type_t; + +#define MAX_CURR_LEN_UNKNOWN 0xffff + +typedef enum +{ + PVTI_OUTPUT_NEXT_DROP, + PVTI_OUTPUT_NEXT_INTERFACE_OUTPUT, + PVTI_OUTPUT_NEXT_IP4_LOOKUP, + PVTI_OUTPUT_NEXT_IP6_LOOKUP, + PVTI_OUTPUT_N_NEXT, +} pvti_output_next_t; + +#endif // pvti_output_h diff --git a/src/plugins/pvti/pvti.api b/src/plugins/pvti/pvti.api new file mode 100644 index 00000000000..859ed1ab6b0 --- /dev/null +++ b/src/plugins/pvti/pvti.api @@ -0,0 +1,111 @@ +/* + * pvti.api - binary API skeleton + * + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file pvti.api + * @brief VPP control-plane API messages. + * + * This file defines VPP control-plane binary API messages which are generally + * called through a shared memory interface. + */ + +/* Version and type recitations */ + +option version = "0.0.1"; +import "vnet/interface_types.api"; +import "vnet/ip/ip_types.api"; + +/** \brief A composite type uniquely defining a PVTI tunnel. + @param sw_if_index - ignored on create/delete, present in details. + @param src_ip - Source IP address + @param src_port - Source UDP port + @param dst_ip - Destination IP address + @param dst_port - Destination UDP port + @param underlay_mtu - Underlay MTU for packet splitting/coalescing + @param underlay_fib_index - Underlay FIB index to be used after encap +*/ +typedef pvti_tunnel +{ + vl_api_interface_index_t sw_if_index; + vl_api_address_t local_ip; + u16 local_port; + vl_api_address_t remote_ip; + bool peer_address_from_payload; + u16 remote_port; + u16 underlay_mtu; + u32 underlay_fib_index; +}; + + +/** @brief API to enable / disable pvti on an interface + @param client_index - opaque cookie to identify the sender + @param context - sender context, to match reply w/ request + @param enable_disable - 1 to enable, 0 to disable the feature + @param sw_if_index - interface handle +*/ + +define pvti_interface_create +{ + option status="in_progress"; + + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + vl_api_pvti_tunnel_t interface; +}; + +define pvti_interface_create_reply +{ + option status="in_progress"; + u32 context; + i32 retval; + + /* Index for the newly created interface */ + vl_api_interface_index_t sw_if_index; +}; + +autoreply define pvti_interface_delete { + option status="in_progress"; + + /* Client identifier, set from api_main.my_client_index */ + u32 client_index; + + /* Arbitrary context, so client can match reply to request */ + u32 context; + + vl_api_interface_index_t sw_if_index; +}; + + +define pvti_interface_dump +{ + option status="in_progress"; + u32 client_index; + u32 context; + vl_api_interface_index_t sw_if_index; +}; + +define pvti_interface_details +{ + option status="in_progress"; + u32 context; + vl_api_pvti_tunnel_t interface; +}; + + diff --git a/src/plugins/pvti/pvti.c b/src/plugins/pvti/pvti.c new file mode 100644 index 00000000000..524eabc6f3f --- /dev/null +++ b/src/plugins/pvti/pvti.c @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/vnet.h> +#include <vnet/plugin/plugin.h> +#include <vnet/fib/fib_table.h> +#include <pvti/pvti.h> + +#include <vlibapi/api.h> +#include <vlibmemory/api.h> +#include <vpp/app/version.h> +#include <stdbool.h> + +#include <pvti/pvti.api_enum.h> +#include <pvti/pvti.api_types.h> + +#include <pvti/pvti_if.h> + +#define REPLY_MSG_ID_BASE pmp->msg_id_base +#include <vlibapi/api_helper_macros.h> +#include <vnet/ip/ip_format_fns.h> + +pvti_main_t pvti_main; + +u8 * +format_pvti_tx_peer_ptr (u8 *s, va_list *args) +{ + pvti_tx_peer_t *peer = va_arg (*args, pvti_tx_peer_t *); + + s = format ( + s, + "[%p]%s local:%U:%d remote:%U:%d underlay_mtu:%d underlay_fib_idx:%d " + "pvti_idx:%d b0_max_clen:%d cseq:%d chunk_count:%d reass_chunk_count:%d", + peer, peer->deleted ? " DELETED" : "", format_ip46_address, + &peer->local_ip, IP46_TYPE_ANY, peer->local_port, format_ip46_address, + &peer->remote_ip, IP46_TYPE_ANY, peer->remote_port, peer->underlay_mtu, + peer->underlay_fib_index, peer->pvti_if_index, + peer->bo0_max_current_length, peer->current_tx_seq, peer->chunk_count, + peer->reass_chunk_count); + + return (s); +} + +u8 * +format_pvti_rx_peer_ptr (u8 *s, va_list *args) +{ + pvti_rx_peer_t *peer = va_arg (*args, pvti_rx_peer_t *); + + s = format (s, "[%p]%s local:%U:%d remote:%U:%d pvti_idx:%d", peer, + peer->deleted ? " DELETED" : "", format_ip46_address, + &peer->local_ip, IP46_TYPE_ANY, peer->local_port, + format_ip46_address, &peer->remote_ip, IP46_TYPE_ANY, + peer->remote_port, peer->pvti_if_index); + + return (s); +} + +void +pvti_verify_initialized (pvti_main_t *pvm) +{ + if (!pvm->is_initialized) + { + const int n_threads = vlib_get_n_threads (); + vec_validate (pvm->per_thread_data[0], n_threads - 1); + vec_validate (pvm->per_thread_data[1], n_threads - 1); + pvm->is_initialized = 1; + } +} + +void +vnet_int_pvti_bypass_mode (u32 sw_if_index, u8 is_ip6, u8 is_enable) +{ + pvti_main_t *pvm = &pvti_main; + + if (pool_is_free_index (pvm->vnet_main->interface_main.sw_interfaces, + sw_if_index)) + return; + + pvti_verify_initialized (pvm); + + is_enable = !!is_enable; + + if (is_ip6) + { + if (clib_bitmap_get (pvm->bm_ip6_bypass_enabled_by_sw_if, sw_if_index) != + is_enable) + { + vnet_feature_enable_disable ("ip6-unicast", "ip6-pvti-bypass", + sw_if_index, is_enable, 0, 0); + pvm->bm_ip6_bypass_enabled_by_sw_if = clib_bitmap_set ( + pvm->bm_ip6_bypass_enabled_by_sw_if, sw_if_index, is_enable); + } + } + else + { + if (clib_bitmap_get (pvm->bm_ip4_bypass_enabled_by_sw_if, sw_if_index) != + is_enable) + { + vnet_feature_enable_disable ("ip4-unicast", "ip4-pvti-bypass", + sw_if_index, is_enable, 0, 0); + pvm->bm_ip4_bypass_enabled_by_sw_if = clib_bitmap_set ( + pvm->bm_ip4_bypass_enabled_by_sw_if, sw_if_index, is_enable); + } + } +} + +static clib_error_t * +set_ip_pvti_bypass (u32 is_ip6, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + vnet_main_t *vnm = vnet_get_main (); + clib_error_t *error = 0; + u32 sw_if_index, is_enable; + + sw_if_index = ~0; + is_enable = 1; + + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat_user (line_input, unformat_vnet_sw_interface, vnm, + &sw_if_index)) + ; + else if (unformat (line_input, "del")) + is_enable = 0; + else + { + error = unformat_parse_error (line_input); + goto done; + } + } + + if (~0 == sw_if_index) + { + error = clib_error_return (0, "unknown interface `%U'", + format_unformat_error, line_input); + goto done; + } + + vnet_int_pvti_bypass_mode (sw_if_index, is_ip6, is_enable); + +done: + unformat_free (line_input); + + return error; +} + +static clib_error_t * +set_ip4_pvti_bypass (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + return set_ip_pvti_bypass (0, input, cmd); +} + +VLIB_CLI_COMMAND (set_interface_ip_pvti_bypass_command, static) = { + .path = "set interface ip pvti-bypass", + .function = set_ip4_pvti_bypass, + .short_help = "set interface ip pvti-bypass <interface> [del]", +}; + +static clib_error_t * +set_ip6_pvti_bypass (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + return set_ip_pvti_bypass (1, input, cmd); +} + +VLIB_CLI_COMMAND (set_interface_ip6_pvti_bypass_command, static) = { + .path = "set interface ip6 pvti-bypass", + .function = set_ip6_pvti_bypass, + .short_help = "set interface ip6 pvti-bypass <interface> [del]", +}; + +static clib_error_t * +pvti_interface_create_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + unformat_input_t _line_input, *line_input = &_line_input; + clib_error_t *error = 0; + + // pvti_main_t * pmp = &pvti_main; + u32 sw_if_index = ~0; + int rv = 0; + ip_address_t peer_ip = { 0 }; + ip_address_t local_ip = { 0 }; + u32 peer_port = 0; + u32 local_port = 12345; + u32 underlay_mtu = 1500; + u32 underlay_fib_index = ~0; + u32 underlay_table_id = ~0; + pvti_peer_address_method_t peer_address_method = PVTI_PEER_ADDRESS_FIXED; + bool peer_set = 0; + + /* Get a line of input. */ + if (!unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "peer %U %d %d", unformat_ip_address, &peer_ip, + &peer_port, &local_port)) + { + peer_set = 1; + } + else if (unformat (line_input, "underlay-mtu %d", &underlay_mtu)) + { + // MTU set + } + else if (unformat (line_input, "local-ip %U", unformat_ip_address, + &local_ip)) + { + // local IP set + } + else if (unformat (line_input, "underlay-fib %d", &underlay_fib_index)) + { + // underlay fib set + } + else if (unformat (line_input, "peer-address-from-payload")) + { + peer_address_method = PVTI_PEER_ADDRESS_FROM_PAYLOAD; + } + else if (unformat (line_input, "underlay-table %d", &underlay_table_id)) + { + fib_protocol_t fib_proto = FIB_PROTOCOL_IP4; + if (peer_ip.version == AF_IP6) + { + fib_proto = FIB_PROTOCOL_IP6; + } + u32 fib_index = fib_table_find (fib_proto, underlay_table_id); + + if (~0 == fib_index) + { + error = clib_error_return (0, "Nonexistent table id %d", + underlay_table_id); + goto done; + } + underlay_fib_index = fib_index; + } + else + break; + } + if (!peer_set) + { + error = clib_error_return (0, "Please specify a peer..."); + goto done; + } + + rv = pvti_if_create (&local_ip, local_port, &peer_ip, peer_port, + peer_address_method, underlay_mtu, underlay_fib_index, + &sw_if_index); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_INVALID_SW_IF_INDEX: + error = clib_error_return (0, "Invalid interface"); + break; + + default: + error = clib_error_return (0, "pvti_if_create returned %d", rv); + } +done: + unformat_free (line_input); + return error; +} + +static clib_error_t * +pvti_interface_delete_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + // pvti_main_t * pmp = &pvti_main; + u32 sw_if_index = ~0; + int rv = 0; + bool if_index_set = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "if-index %d", &sw_if_index)) + { + if_index_set = 1; + } + else + break; + } + if (!if_index_set) + return clib_error_return (0, "Please specify a sw_if_index..."); + + rv = pvti_if_delete (sw_if_index); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_INVALID_SW_IF_INDEX: + return clib_error_return (0, "Invalid interface"); + break; + + default: + return clib_error_return (0, "pvti_if_delete returned %d", rv); + } + return 0; +} + +VLIB_CLI_COMMAND (pvti_interface_create_command, static) = { + .path = "pvti interface create", + .short_help = + "pvti interface create peer <remote-ip> <remote-port> <local-port> [ " + "local-ip <ip-addr> ][ underlay-mtu <MTU>][underlay-table " + "<table-index>][inderlay-fib <fib-index>]", + .function = pvti_interface_create_command_fn, +}; + +VLIB_CLI_COMMAND (pvti_interface_delete_command, static) = { + .path = "pvti interface delete", + .short_help = "pvti interface delete if-index <sw-ifindex>", + .function = pvti_interface_delete_command_fn, +}; + +static clib_error_t * +pvti_show_interface_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + pvti_if_t *pvti_if; + vec_foreach (pvti_if, pvti_main.if_pool) + { + int index = pvti_if - pvti_main.if_pool; + vlib_cli_output (vm, "%U", format_pvti_if, index); + }; + return 0; +} + +static clib_error_t * +pvti_show_tx_peers_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + pvti_per_thread_data_t *ptd; + int is_ip6; + for (is_ip6 = 0; is_ip6 <= 1; is_ip6++) + { + vec_foreach (ptd, pvti_main.per_thread_data[is_ip6]) + { + vlib_cli_output (vm, "thread %d (%s)", + ptd - pvti_main.per_thread_data[is_ip6], + is_ip6 ? "IPv6" : "IPv4"); + pvti_tx_peer_t *peer; + vec_foreach (peer, ptd->tx_peers) + { + vlib_cli_output (vm, " %U", format_pvti_tx_peer_ptr, peer); + } + } + } + return 0; +} + +static clib_error_t * +pvti_show_rx_peers_command_fn (vlib_main_t *vm, unformat_input_t *input, + vlib_cli_command_t *cmd) +{ + pvti_per_thread_data_t *ptd; + int is_ip6; + for (is_ip6 = 0; is_ip6 <= 1; is_ip6++) + { + vec_foreach (ptd, pvti_main.per_thread_data[is_ip6]) + { + vlib_cli_output (vm, "thread %d (%s)", + ptd - pvti_main.per_thread_data[is_ip6], + is_ip6 ? "IPv6" : "IPv4"); + pvti_rx_peer_t *peer; + vec_foreach (peer, ptd->rx_peers) + { + vlib_cli_output (vm, " %U", format_pvti_rx_peer_ptr, peer); + } + } + } + return 0; +} + +VLIB_CLI_COMMAND (pvti_show_interface_command, static) = { + .path = "show pvti interface", + .short_help = "show pvti interface", + .function = pvti_show_interface_command_fn, +}; + +VLIB_CLI_COMMAND (pvti_show_tx_peers_command, static) = { + .path = "show pvti tx peers", + .short_help = "show pvti tx peers", + .function = pvti_show_tx_peers_command_fn, +}; + +VLIB_CLI_COMMAND (pvti_show_rx_peers_command, static) = { + .path = "show pvti rx peers", + .short_help = "show pvti rx peers", + .function = pvti_show_rx_peers_command_fn, +}; + +void pvti_api_init (); + +VNET_FEATURE_INIT (pvti4_bypass, static) = { + .arc_name = "ip4-unicast", + .node_name = "ip4-pvti-bypass", + .runs_before = 0, +}; + +VNET_FEATURE_INIT (pvti6_bypass, static) = { + .arc_name = "ip6-unicast", + .node_name = "ip6-pvti-bypass", + .runs_before = 0, +}; + +static clib_error_t * +pvti_early_config (vlib_main_t *vm, unformat_input_t *input) +{ + clib_warning ("early config pvti"); + u8 *runs_before = 0; + int rbi = 0; + if (vec_len (vnet_feat_pvti4_bypass.runs_before) == 0) + { + rbi = 0; + } + else + { + rbi = vec_len (vnet_feat_pvti4_bypass.runs_before) - 1; + } + vec_validate (vnet_feat_pvti4_bypass.runs_before, rbi); + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "runs-before %v", &runs_before)) + { + vec_add1 (runs_before, 0); + vnet_feat_pvti4_bypass.runs_before[rbi] = (char *) runs_before; + vec_add1 (vnet_feat_pvti4_bypass.runs_before, 0); + } + else + return clib_error_return (0, "unknown input"); + } + + return NULL; +} + +VLIB_EARLY_CONFIG_FUNCTION (pvti_early_config, "pvti"); + +static clib_error_t * +pvti_init (vlib_main_t *vm) +{ + pvti_main_t *pmp = &pvti_main; + clib_error_t *error = 0; + clib_warning ("pvti init"); + + pmp->vlib_main = vm; + pmp->vnet_main = vnet_get_main (); + pmp->is_initialized = 0; + + pvti_api_init (); + return error; +} + +VLIB_INIT_FUNCTION (pvti_init); + +VLIB_PLUGIN_REGISTER () = { + .version = VPP_BUILD_VER, + .description = "Packet Vector Tunnel Interface plugin", +}; diff --git a/src/plugins/pvti/pvti.h b/src/plugins/pvti/pvti.h new file mode 100644 index 00000000000..ac097c5ecca --- /dev/null +++ b/src/plugins/pvti/pvti.h @@ -0,0 +1,257 @@ +/* + * pvti.h - skeleton vpp engine plug-in header file + * + * Copyright (c) 2024 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __included_pvti_h__ +#define __included_pvti_h__ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> + +#define VPP_MAX_THREADS (1 << 8) + +#define MAX_RX_STREAMS 256 + +#define PVTI_ALIGN_BYTES 9 + +typedef CLIB_PACKED (struct { + u32 seq; + u8 stream_index; // set to the cpu# on the sending side + u8 chunk_count; + u8 reass_chunk_count; // number of chunks in the front that are related to + // previously started buffer + // mandatory_flags_mask highlights which of the flags cause packet drop if + // not understood, and which of them can be just ignored. + u8 mandatory_flags_mask; + u8 flags_value; + u8 pad_bytes; + u8 pad[0]; +}) pvti_packet_header_t; + +typedef CLIB_PACKED (struct { + ip4_header_t ip4; + udp_header_t udp; + // not part of encap header pvti_packet_header_t pv; +}) pvti_ip4_encap_header_t; + +typedef CLIB_PACKED (struct { + ip6_header_t ip6; + udp_header_t udp; + // not part of encap header pvti_packet_header_t pv; +}) pvti_ip6_encap_header_t; + +typedef CLIB_PACKED (struct { + u16 total_chunk_length; + // More fragments: this chunk is not the last block fragment +#define CHUNK_FLAGS_MF (1 << 0) + // More blocks: this block has chained blocks that follow +#define CHUNK_FLAGS_MB (1 << 1) + u16 _pad0; + u32 _pad1; + u8 chunk_data[0]; +}) pvti_chunk_header_t; + +typedef struct +{ + // a buffer being built from the smaller packets + u32 bi0; + + // how big can this buffer grow + u32 bi0_max_current_length; + + // how many chunks are already in the buffer + u8 chunk_count; + // leading reassembly chunk count + u8 reass_chunk_count; + + u32 current_tx_seq; +} pvti_per_tx_stream_data_t; + +typedef struct +{ + /* The seq# that we last processed */ + u32 last_rx_seq; + + // a current buffer that is being reassembled + u32 rx_bi0; + // The root buffer, most of the times == rx_bi0 except in the case of chained + // buffers. + u32 rx_bi0_first; + + // Next index for dispatch when the reassembly is done + u16 rx_next0; + // expected totall inner length for the packet + u16 rx_expected_inner_length; + u16 rx_received_inner_length; + +} pvti_per_rx_stream_data_t; + +typedef struct +{ + ip_address_t local_ip; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u16 underlay_mtu; + u32 underlay_fib_index; + + u32 pvti_if_index; + bool deleted; + bool is_bo0_traced; + + u32 bo0_max_current_length; + + u8 chunk_count; + u8 reass_chunk_count; + u32 current_tx_seq; + vlib_buffer_t *bo0; + +} pvti_tx_peer_t; + +typedef struct +{ + ip_address_t local_ip; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + + pvti_per_rx_stream_data_t rx_streams[MAX_RX_STREAMS]; + + u32 pvti_if_index; + bool deleted; +} pvti_rx_peer_t; + +typedef struct +{ + /* pool of destination-based structures which are used to build the packets + */ + pvti_tx_peer_t *tx_peers; + + /* vector of buffers to send */ + u32 *pending_tx_buffers; + u16 *pending_tx_nexts; + /* pool of source-based structures for the remote peers' data tracking + */ + pvti_rx_peer_t *rx_peers; + + /* vector of buffers being decapsulated */ + u32 *pending_rx_buffers; + u16 *pending_rx_nexts; + +} pvti_per_thread_data_t; + +typedef struct +{ + ip_address_t local_ip; + ip_address_t remote_ip; + u16 remote_port; + u16 local_port; + u16 underlay_mtu; + u32 underlay_fib_index; + bool peer_address_from_payload; + u64 created_at; + + u32 sw_if_index; + u32 hw_if_index; + + // per-stream data for TX + pvti_per_tx_stream_data_t tx_streams[256]; + pvti_per_rx_stream_data_t rx_streams[256]; + +} pvti_if_t; + +typedef struct +{ + /* API message ID base */ + u16 msg_id_base; + + /* have we initialized the data structures ? */ + bool is_initialized; + + /* interface pool */ + pvti_if_t *if_pool; + + /* if_index in the pool above by sw_if_index */ + index_t *if_index_by_sw_if_index; + + /* indices by port */ + index_t **if_indices_by_port; + + /* per-thread data, ip4[0] and ip6[1] */ + pvti_per_thread_data_t *per_thread_data[2]; + + /* on/off switch for the periodic function */ + u8 periodic_timer_enabled; + /* Node index, non-zero if the periodic process has been created */ + u32 periodic_node_index; + + /* graph node state */ + uword *bm_ip4_bypass_enabled_by_sw_if; + uword *bm_ip6_bypass_enabled_by_sw_if; + + /* convenience */ + vlib_main_t *vlib_main; + vnet_main_t *vnet_main; + ethernet_main_t *ethernet_main; +} pvti_main_t; + +extern pvti_main_t pvti_main; + +extern vlib_node_registration_t pvti_node; +extern vlib_node_registration_t pvti4_input_node; +extern vlib_node_registration_t pvti4_output_node; +extern vlib_node_registration_t pvti6_input_node; +extern vlib_node_registration_t pvti6_output_node; +extern vlib_node_registration_t pvti_periodic_node; + +always_inline u8 +pvti_get_stream_index (int is_ip6) +{ + u32 thread_index = vlib_get_thread_index (); + + ASSERT ((thread_index & 0xffffff80) == 0); + + u8 stream_index = (thread_index & 0x7f) | (is_ip6 ? 0x80 : 0); + return stream_index; +} + +/* attempt to get a new buffer */ +always_inline u32 +pvti_get_new_buffer (vlib_main_t *vm) +{ + u32 bi0 = INDEX_INVALID; + if (vlib_buffer_alloc (vm, &bi0, 1) != 1) + { + return INDEX_INVALID; + } + vlib_buffer_t *b0 = vlib_get_buffer (vm, bi0); + b0->current_data = 0; + b0->current_length = 0; + return bi0; +} + +/* Periodic function events */ +#define PVTI_EVENT1 1 +#define PVTI_EVENT2 2 +#define PVTI_EVENT_PERIODIC_ENABLE_DISABLE 3 + +void pvti_create_periodic_process (pvti_main_t *); +void pvti_verify_initialized (pvti_main_t *pvm); + +#endif /* __included_pvti_h__ */ diff --git a/src/plugins/pvti/pvti_if.c b/src/plugins/pvti/pvti_if.c new file mode 100644 index 00000000000..4f83994a1a4 --- /dev/null +++ b/src/plugins/pvti/pvti_if.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2020 Cisco and/or its affiliates. + * Copyright (c) 2020 Doc.ai and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vnet/adj/adj_midchain.h> +#include <vnet/udp/udp.h> + +#include <pvti/pvti.h> +#include <pvti/pvti_if.h> + +static u8 * +format_pvti_if_name (u8 *s, va_list *args) +{ + u32 dev_instance = va_arg (*args, u32); + // wg_if_t *wgi = wg_if_get (dev_instance); + return format (s, "pvti%d", dev_instance); +} + +u8 * +format_pvti_if (u8 *s, va_list *args) +{ + index_t pvtii = va_arg (*args, u32); + pvti_if_t *pvti_if = pvti_if_get (pvtii); + + s = format ( + s, "[%d] %U local:%U:%d remote:%U:%d underlay_mtu:%d underlay_fib_idx:%d", + pvtii, format_vnet_sw_if_index_name, vnet_get_main (), + pvti_if->sw_if_index, format_ip46_address, &pvti_if->local_ip, + IP46_TYPE_ANY, pvti_if->local_port, format_ip46_address, + &pvti_if->remote_ip, IP46_TYPE_ANY, pvti_if->remote_port, + pvti_if->underlay_mtu, pvti_if->underlay_fib_index); + + return (s); +} + +index_t +pvti_if_find_by_sw_if_index (u32 sw_if_index) +{ + if (vec_len (pvti_main.if_index_by_sw_if_index) <= sw_if_index) + return INDEX_INVALID; + u32 ti = pvti_main.if_index_by_sw_if_index[sw_if_index]; + if (ti == ~0) + return INDEX_INVALID; + + return (ti); +} + +index_t +pvti_if_find_by_remote_ip4_and_port (ip4_address_t *remote_ip4, + u16 remote_port) +{ + pvti_if_t *ifc; + pool_foreach (ifc, pvti_main.if_pool) + { + if ((ifc->remote_port == remote_port) && + (ifc->remote_ip.version == AF_IP4) && + ((ifc->remote_ip.ip.ip4.as_u32 == remote_ip4->as_u32) || + ifc->peer_address_from_payload)) + { + return (ifc - pvti_main.if_pool); + } + } + return INDEX_INVALID; +} + +index_t +pvti_if_find_by_remote_ip6_and_port (ip6_address_t *remote_ip6, + u16 remote_port) +{ + pvti_if_t *ifc; + pool_foreach (ifc, pvti_main.if_pool) + { + if ((ifc->remote_port == remote_port) && + (ifc->remote_ip.version == AF_IP6) && + ((0 == memcmp (&ifc->remote_ip.ip.ip6, remote_ip6, + sizeof (*remote_ip6))) || + ifc->peer_address_from_payload)) + { + return (ifc - pvti_main.if_pool); + } + } + return INDEX_INVALID; +} + +index_t +pvti_if_find_by_remote_ip_and_port (ip_address_t *remote_ip, u16 remote_port) +{ + pvti_if_t *ifc; + pool_foreach (ifc, pvti_main.if_pool) + { + if ((ifc->remote_port == remote_port) && + (ifc->peer_address_from_payload || + (0 == ip_address_cmp (remote_ip, &ifc->remote_ip)))) + { + return (ifc - pvti_main.if_pool); + } + } + return INDEX_INVALID; +} + +static void +pvti_add_tidx_by_port (index_t t_index, u16 port) +{ + pvti_main_t *pvm = &pvti_main; + vec_validate_init_empty (pvm->if_indices_by_port, port, NULL); + vec_add1 (pvm->if_indices_by_port[port], t_index); +} + +static void +pvti_del_tidx_by_port (index_t t_index, u16 port) +{ + pvti_main_t *pvm = &pvti_main; + index_t *ii; + if (!pvm->if_indices_by_port) + { + return; + } + if (port >= vec_len (pvm->if_indices_by_port)) + { + return; + } + if (vec_len (pvm->if_indices_by_port[port]) == 0) + { + ALWAYS_ASSERT (pvm->if_indices_by_port[port] > 0); + /* not reached */ + return; + } + + vec_foreach (ii, pvm->if_indices_by_port[port]) + { + if (*ii == t_index) + { + vec_del1 (pvm->if_indices_by_port[port], + pvm->if_indices_by_port[port] - ii); + break; + } + } +} + +static u32 +pvti_get_tunnel_count_by_port (u16 port) +{ + pvti_main_t *pvm = &pvti_main; + if (!pvm->if_indices_by_port) + { + return 0; + } + return vec_len (vec_elt (pvm->if_indices_by_port, port)); +} + +static clib_error_t * +pvti_if_admin_up_down (vnet_main_t *vnm, u32 hw_if_index, u32 flags) +{ + // vnet_hw_interface_t *hi; + u32 hw_flags; + + // hi = vnet_get_hw_interface (vnm, hw_if_index); + hw_flags = + (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP ? VNET_HW_INTERFACE_FLAG_LINK_UP : + 0); + vnet_hw_interface_set_flags (vnm, hw_if_index, hw_flags); + + return (NULL); +} + +void +pvti_if_update_adj (vnet_main_t *vnm, u32 sw_if_index, adj_index_t ai) +{ + + /* Convert any neighbour adjacency that has a next-hop reachable through + * the wg interface into a midchain. This is to avoid sending ARP/ND to + * resolve the next-hop address via the wg interface. Then, if one of the + * peers has matching prefix among allowed prefixes, the midchain will be + * updated to the corresponding one. + */ + adj_nbr_midchain_update_rewrite (ai, NULL, NULL, ADJ_FLAG_NONE, NULL); + + // wgii = wg_if_find_by_sw_if_index (sw_if_index); + // wg_if_peer_walk (wg_if_get (wgii), wg_peer_if_adj_change, &ai); +} + +VNET_DEVICE_CLASS (pvti_if_device_class) = { + .name = "Packet Vectorizer Tunnel", + .format_device_name = format_pvti_if_name, + .admin_up_down_function = pvti_if_admin_up_down, +}; + +VNET_HW_INTERFACE_CLASS (pvti_hw_interface_class) = { + .name = "PVTunnel", + .update_adjacency = pvti_if_update_adj, + .flags = VNET_HW_INTERFACE_CLASS_FLAG_P2P, + // .flags = VNET_HW_INTERFACE_CLASS_FLAG_NBMA, +}; + +int +pvti_if_create (ip_address_t *local_ip, u16 local_port, + ip_address_t *remote_ip, u16 remote_port, + pvti_peer_address_method_t peer_address_method, + u16 underlay_mtu, u32 underlay_fib_index, u32 *sw_if_indexp) +{ + vnet_main_t *vnm = vnet_get_main (); + pvti_main_t *pvm = &pvti_main; + u32 hw_if_index; + vnet_hw_interface_t *hi; + pvti_verify_initialized (pvm); + + pvti_if_t *pvti_if; + + ASSERT (sw_if_indexp); + + *sw_if_indexp = (u32) ~0; + + pool_get_zero (pvti_main.if_pool, pvti_if); + pvti_if->local_ip = *local_ip; + pvti_if->local_port = local_port; + pvti_if->remote_ip = *remote_ip; + if (peer_address_method == PVTI_PEER_ADDRESS_FROM_PAYLOAD) + { + pvti_if->peer_address_from_payload = 1; + } + pvti_if->remote_port = remote_port; + pvti_if->underlay_mtu = underlay_mtu; + pvti_if->underlay_fib_index = underlay_fib_index; + pvti_if->created_at = clib_cpu_time_now (); + + /* tunnel index (or instance) */ + u32 t_idx = pvti_if - pvti_main.if_pool; + + hw_if_index = + vnet_register_interface (vnm, pvti_if_device_class.index, t_idx, + pvti_hw_interface_class.index, t_idx); + + pvti_if->hw_if_index = hw_if_index; + + hi = vnet_get_hw_interface (vnm, hw_if_index); + pvti_if->sw_if_index = *sw_if_indexp = hi->sw_if_index; + + vec_validate_init_empty (pvm->if_index_by_sw_if_index, hi->sw_if_index, + INDEX_INVALID); + + vec_elt (pvm->if_index_by_sw_if_index, hi->sw_if_index) = t_idx; + pvti_if_t *pvti_if0 = pool_elt_at_index (pvti_main.if_pool, t_idx); + int i; + for (i = 0; i < 256; i++) + { + pvti_if0->tx_streams[i].bi0 = INDEX_INVALID; + pvti_if0->tx_streams[i].current_tx_seq = 42; + + pvti_if0->rx_streams[i].rx_bi0 = INDEX_INVALID; + pvti_if0->rx_streams[i].rx_bi0_first = INDEX_INVALID; + } + + /* + int is_ip6 = 0; + u32 encap_index = !is_ip6 ? + pvti4_output_node.index : pvti6_output_node.index; + vnet_set_interface_output_node (vnm, pvti_if->hw_if_index, encap_index); + */ + vnet_set_interface_l3_output_node (vnm->vlib_main, hi->sw_if_index, + (u8 *) "pvti4-output"); + + pvti_add_tidx_by_port (t_idx, local_port); + if (1 == pvti_get_tunnel_count_by_port (local_port)) + { + clib_warning ("Registering local port %d", local_port); + udp_register_dst_port (vlib_get_main (), local_port, + pvti4_input_node.index, UDP_IP4); + udp_register_dst_port (vlib_get_main (), local_port, + pvti6_input_node.index, UDP_IP6); + } + else + { + clib_warning ("Not registering the port"); + } + + vnet_hw_interface_set_flags (vnm, pvti_if->hw_if_index, + VNET_HW_INTERFACE_FLAG_LINK_UP); + + return 0; +} + +void +pvti_if_walk (pvti_if_walk_cb_t fn, void *data) +{ + index_t pvtii; + + pool_foreach_index (pvtii, pvti_main.if_pool) + { + if (WALK_STOP == fn (pvtii, data)) + break; + } +} + +int +pvti_if_delete (u32 sw_if_index) +{ + vnet_main_t *vnm = vnet_get_main (); + pvti_main_t *pvm = &pvti_main; + + if (pool_is_free_index (vnm->interface_main.sw_interfaces, sw_if_index)) + return VNET_API_ERROR_INVALID_SW_IF_INDEX; + + vnet_hw_interface_t *hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + if (hw == 0 || hw->dev_class_index != pvti_if_device_class.index) + return VNET_API_ERROR_INVALID_VALUE; + + pvti_if_t *ifc; + bool found = 0; + pool_foreach (ifc, pvm->if_pool) + { + if (ifc->sw_if_index == sw_if_index) + { + found = 1; + break; + } + } + if (!found) + { + return VNET_API_ERROR_INVALID_VALUE_2; + } + index_t tidx = ifc - pvm->if_pool; + + u16 local_port = ifc->local_port; + pvti_del_tidx_by_port (tidx, local_port); + pvm->if_index_by_sw_if_index[sw_if_index] = INDEX_INVALID; + + if (0 == pvti_get_tunnel_count_by_port (local_port)) + { + udp_unregister_dst_port (vlib_get_main (), local_port, 1); + udp_unregister_dst_port (vlib_get_main (), local_port, 0); + } + + vnet_reset_interface_l3_output_node (vnm->vlib_main, sw_if_index); + vnet_delete_hw_interface (vnm, hw->hw_if_index); + pool_put (pvti_main.if_pool, ifc); + + /* mark per-thread peers as deleted */ + pvti_per_thread_data_t *ptd; + + vec_foreach (ptd, pvm->per_thread_data[0]) + { + pvti_tx_peer_t *peer; + vec_foreach (peer, ptd->tx_peers) + { + if (tidx == peer->pvti_if_index) + { + peer->deleted = 1; + } + } + } + vec_foreach (ptd, pvm->per_thread_data[1]) + { + pvti_tx_peer_t *peer; + vec_foreach (peer, ptd->tx_peers) + { + if (tidx == peer->pvti_if_index) + { + peer->deleted = 1; + } + } + } + + return 0; +} diff --git a/src/plugins/pvti/pvti_if.h b/src/plugins/pvti/pvti_if.h new file mode 100644 index 00000000000..44bf22ce825 --- /dev/null +++ b/src/plugins/pvti/pvti_if.h @@ -0,0 +1,47 @@ +#ifndef PVTI_IF_H +#define PVTI_IF_H + +#include <vnet/interface_funcs.h> + +typedef enum +{ + PVTI_PEER_ADDRESS_FIXED = 0, + PVTI_PEER_ADDRESS_FROM_PAYLOAD +} pvti_peer_address_method_t; + +typedef walk_rc_t (*pvti_if_walk_cb_t) (index_t wgi, void *data); +void pvti_if_walk (pvti_if_walk_cb_t fn, void *data); + +int pvti_if_create (ip_address_t *local_ip, u16 local_port, + ip_address_t *remote_ip, u16 remote_port, + pvti_peer_address_method_t peer_address_method, + u16 underlay_mtu, u32 underlay_fib_index, + u32 *sw_if_indexp); +index_t pvti_if_find_by_sw_if_index (u32 sw_if_index); +index_t pvti_if_find_by_remote_ip4_and_port (ip4_address_t *remote_ip4, + u16 remote_port); +index_t pvti_if_find_by_remote_ip6_and_port (ip6_address_t *remote_ip4, + u16 remote_port); + +index_t pvti_if_find_by_remote_ip_and_port (ip_address_t *remote_ip, + u16 remote_port); + +int pvti_if_delete (u32 sw_if_index); + +u8 *format_pvti_if (u8 *s, va_list *args); + +static_always_inline pvti_if_t * +pvti_if_get (index_t pvtii) +{ + if (INDEX_INVALID == pvtii) + return (NULL); + return (pool_elt_at_index (pvti_main.if_pool, pvtii)); +} + +static_always_inline index_t +pvti_if_get_index (pvti_if_t *pvti_if) +{ + return pvti_if - pvti_main.if_pool; +} + +#endif |