diff options
author | 2015-12-08 15:45:58 -0700 | |
---|---|---|
committer | 2015-12-08 15:47:27 -0700 | |
commit | cb9cadad578297ffd78fa8a33670bdf1ab669e7e (patch) | |
tree | 6ac2be912482cc7849a26f0ab845561c3d7f4e26 /vnet/vnet/ip | |
parent | fb0815d4ae4bb0fe27bd9313f34b45c8593b907e (diff) |
Initial commit of vpp code.v1.0.0
Change-Id: Ib246f1fbfce93274020ee93ce461e3d8bd8b9f17
Signed-off-by: Ed Warnicke <eaw@cisco.com>
Diffstat (limited to 'vnet/vnet/ip')
55 files changed, 28894 insertions, 0 deletions
diff --git a/vnet/vnet/ip/format.c b/vnet/vnet/ip/format.c new file mode 100644 index 00000000000..9dda4c5e10b --- /dev/null +++ b/vnet/vnet/ip/format.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip_format.c: ip generic (4 or 6) formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/ip/ip.h> + +/* Format IP protocol. */ +u8 * format_ip_protocol (u8 * s, va_list * args) +{ + ip_protocol_t protocol = va_arg (*args, ip_protocol_t); + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi = ip_get_protocol_info (im, protocol); + + if (pi) + return format (s, "%s", pi->name); + else + return format (s, "unknown %d", protocol); +} + +uword unformat_ip_protocol (unformat_input_t * input, va_list * args) +{ + u8 * result = va_arg (*args, u8 *); + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi; + int i; + + if (! unformat_user (input, unformat_vlib_number_by_name, + im->protocol_info_by_name, &i)) + return 0; + + pi = vec_elt_at_index (im->protocol_infos, i); + *result = pi->protocol; + return 1; +} + +u8 * format_tcp_udp_port (u8 * s, va_list * args) +{ + int port = va_arg (*args, int); + ip_main_t * im = &ip_main; + tcp_udp_port_info_t * pi; + + pi = ip_get_tcp_udp_port_info (im, port); + if (pi) + s = format (s, "%s", pi->name); + else + s = format (s, "%d", clib_net_to_host_u16 (port)); + + return s; +} + +uword unformat_tcp_udp_port (unformat_input_t * input, va_list * args) +{ + u16 * result = va_arg (*args, u16 *); + ip_main_t * im = &ip_main; + tcp_udp_port_info_t * pi; + u32 i, port; + + + if (unformat_user (input, unformat_vlib_number_by_name, + im->port_info_by_name, &i)) + { + pi = vec_elt_at_index (im->port_infos, i); + port = pi->port; + } + else if (unformat_user (input, unformat_vlib_number, &port) + && port < (1 << 16)) + port = clib_host_to_net_u16 (port); + + else + return 0; + + *result = port; + return 1; +} + +uword unformat_ip46_address (unformat_input_t * input, va_list * args) +{ + ip46_address_t * a = va_arg (*args, ip46_address_t *); + u32 is_ip6 = va_arg (*args, u32); + if (is_ip6) + return unformat_user (input, unformat_ip6_address, &a->ip6); + else + return unformat_user (input, unformat_ip4_address, &a->ip4); +} diff --git a/vnet/vnet/ip/format.h b/vnet/vnet/ip/format.h new file mode 100644 
index 00000000000..511a9346bf6 --- /dev/null +++ b/vnet/vnet/ip/format.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/format.h: ip 4 and/or 6 formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_ip_format_h +#define included_ip_format_h + +/* IP4 or IP6. 
*/ + +format_function_t format_ip_protocol; +unformat_function_t unformat_ip_protocol; + +format_function_t format_tcp_udp_port; +unformat_function_t unformat_tcp_udp_port; + +format_function_t format_ip_adjacency; +format_function_t format_ip_adjacency_packet_data; + +unformat_function_t unformat_ip46_address; + +/* IP4 */ + +/* Parse an IP4 address %d.%d.%d.%d. */ +unformat_function_t unformat_ip4_address; + +/* Format an IP4 address. */ +format_function_t format_ip4_address; +format_function_t format_ip4_address_and_length; + +/* Parse an IP4 header. */ +unformat_function_t unformat_ip4_header; + +/* Format an IP4 header. */ +format_function_t format_ip4_header; + +/* Parse an IP packet matching pattern. */ +unformat_function_t unformat_ip4_match; + +unformat_function_t unformat_pg_ip4_header; + +/* IP6 */ +unformat_function_t unformat_ip6_address; +format_function_t format_ip6_address; +format_function_t format_ip6_address_and_length; +unformat_function_t unformat_ip6_header; +format_function_t format_ip6_header; +unformat_function_t unformat_pg_ip6_header; + +/* Format a TCP/UDP headers. */ +format_function_t format_tcp_header, format_udp_header; + +unformat_function_t unformat_pg_tcp_header, unformat_pg_udp_header; + +#endif /* included_ip_format_h */ diff --git a/vnet/vnet/ip/icmp4.c b/vnet/vnet/ip/icmp4.c new file mode 100644 index 00000000000..e21f3bf047b --- /dev/null +++ b/vnet/vnet/ip/icmp4.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/icmp4.c: ipv4 icmp + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vnet/ip/ip.h> +#include <vnet/pg/pg.h> + +static u8 * format_ip4_icmp_type_and_code (u8 * s, va_list * args) +{ + icmp4_type_t type = va_arg (*args, int); + u8 code = va_arg (*args, int); + char * t = 0; + +#define _(n,f) case n: t = #f; break; + + switch (type) + { + foreach_icmp4_type; + + default: + break; + } + +#undef _ + + if (! 
t) + return format (s, "unknown 0x%x", type); + + s = format (s, "%s", t); + + t = 0; + switch ((type << 8) | code) + { +#define _(a,n,f) case (ICMP4_##a << 8) | (n): t = #f; break; + + foreach_icmp4_code; + +#undef _ + } + + if (t) + s = format (s, " %s", t); + + return s; +} + +static u8 * format_ip4_icmp_header (u8 * s, va_list * args) +{ + icmp46_header_t * icmp = va_arg (*args, icmp46_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + + /* Nothing to do. */ + if (max_header_bytes < sizeof (icmp[0])) + return format (s, "ICMP header truncated"); + + s = format (s, "ICMP %U checksum 0x%x", + format_ip4_icmp_type_and_code, icmp->type, icmp->code, + clib_net_to_host_u16 (icmp->checksum)); + + return s; +} + +typedef struct { + u8 packet_data[64]; +} icmp_input_trace_t; + +static u8 * format_icmp_input_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + icmp_input_trace_t * t = va_arg (*va, icmp_input_trace_t *); + + s = format (s, "%U", + format_ip4_header, + t->packet_data, sizeof (t->packet_data)); + + return s; +} + +typedef enum { + ICMP4_ERROR_UNKNOWN_TYPE, + ICMP4_ERROR_ECHO_REPLIES_SENT, + ICMP4_ERROR_TTL_EXPIRE_RESP_SENT, + ICMP4_ERROR_TTL_EXPIRE_RESP_DROP, +} icmp_error_t; + +static char * icmp_error_strings[] = { + [ICMP4_ERROR_UNKNOWN_TYPE] = "unknown type", + [ICMP4_ERROR_ECHO_REPLIES_SENT] = "echo replies sent", + [ICMP4_ERROR_TTL_EXPIRE_RESP_SENT] = "TTL time exceeded response sent", + [ICMP4_ERROR_TTL_EXPIRE_RESP_DROP] = "TTL time exceeded response dropped", +}; + +typedef enum { + ICMP_INPUT_NEXT_ERROR, + ICMP_INPUT_N_NEXT, +} icmp_input_next_t; + +typedef struct { + uword * type_and_code_by_name; + + uword * type_by_name; + + /* Vector dispatch table indexed by [icmp type]. 
*/ + u8 ip4_input_next_index_by_type[256]; +} icmp4_main_t; + +icmp4_main_t icmp4_main; + +static uword +ip4_icmp_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + icmp4_main_t * im = &icmp4_main; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next; + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (icmp_input_trace_t)); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + icmp46_header_t * icmp0; + icmp4_type_t type0; + u32 bi0, next0; + + if (PREDICT_TRUE (n_left_from > 2)) + { + vlib_prefetch_buffer_with_index (vm, from[2], LOAD); + p0 = vlib_get_buffer (vm, from[1]); + ip0 = vlib_buffer_get_current (p0); + CLIB_PREFETCH(ip0, CLIB_CACHE_LINE_BYTES, LOAD); + } + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + icmp0 = ip4_next_header (ip0); + type0 = icmp0->type; + next0 = im->ip4_input_next_index_by_type[type0]; + + p0->error = node->errors[ICMP4_ERROR_UNKNOWN_TYPE]; + if (PREDICT_FALSE (next0 != next)) + { + vlib_put_next_frame (vm, node, next, n_left_to_next + 1); + next = next0; + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + } + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip4_icmp_input_node,static) = { + .function = ip4_icmp_input, + .name = "ip4-icmp-input", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp_input_trace, + + .n_errors = ARRAY_LEN 
(icmp_error_strings), + .error_strings = icmp_error_strings, + + .n_next_nodes = 1, + .next_nodes = { + [ICMP_INPUT_NEXT_ERROR] = "error-punt", + }, +}; + +static uword +ip4_icmp_echo_request (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next; + ip4_main_t * i4m = &ip4_main; + u16 * fragment_ids, * fid; + u8 host_config_ttl = i4m->host_config.ttl; + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (icmp_input_trace_t)); + + /* Get random fragment IDs for replies. */ + fid = fragment_ids = clib_random_buffer_get_data (&vm->random_buffer, + n_packets * sizeof (fragment_ids[0])); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + + while (n_left_from > 2 && n_left_to_next > 2) + { + vlib_buffer_t * p0, * p1; + ip4_header_t * ip0, * ip1; + icmp46_header_t * icmp0, * icmp1; + u32 bi0, src0, dst0; + u32 bi1, src1, dst1; + ip_csum_t sum0, sum1; + + bi0 = to_next[0] = from[0]; + bi1 = to_next[1] = from[1]; + + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, bi0); + p1 = vlib_get_buffer (vm, bi1); + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + icmp0 = ip4_next_header (ip0); + icmp1 = ip4_next_header (ip1); + + vnet_buffer (p0)->sw_if_index[VLIB_RX] = vnet_main.local_interface_sw_if_index; + vnet_buffer (p1)->sw_if_index[VLIB_RX] = vnet_main.local_interface_sw_if_index; + + /* Update ICMP checksum. 
*/ + sum0 = icmp0->checksum; + sum1 = icmp1->checksum; + + ASSERT (icmp0->type == ICMP4_echo_request); + ASSERT (icmp1->type == ICMP4_echo_request); + sum0 = ip_csum_update (sum0, ICMP4_echo_request, ICMP4_echo_reply, + icmp46_header_t, type); + sum1 = ip_csum_update (sum1, ICMP4_echo_request, ICMP4_echo_reply, + icmp46_header_t, type); + icmp0->type = ICMP4_echo_reply; + icmp1->type = ICMP4_echo_reply; + + icmp0->checksum = ip_csum_fold (sum0); + icmp1->checksum = ip_csum_fold (sum1); + + src0 = ip0->src_address.data_u32; + src1 = ip1->src_address.data_u32; + dst0 = ip0->dst_address.data_u32; + dst1 = ip1->dst_address.data_u32; + + /* Swap source and destination address. + Does not change checksum. */ + ip0->src_address.data_u32 = dst0; + ip1->src_address.data_u32 = dst1; + ip0->dst_address.data_u32 = src0; + ip1->dst_address.data_u32 = src1; + + /* Update IP checksum. */ + sum0 = ip0->checksum; + sum1 = ip1->checksum; + + sum0 = ip_csum_update (sum0, ip0->ttl, host_config_ttl, + ip4_header_t, ttl); + sum1 = ip_csum_update (sum1, ip1->ttl, host_config_ttl, + ip4_header_t, ttl); + ip0->ttl = host_config_ttl; + ip1->ttl = host_config_ttl; + + /* New fragment id. 
*/ + sum0 = ip_csum_update (sum0, ip0->fragment_id, fid[0], + ip4_header_t, fragment_id); + sum1 = ip_csum_update (sum1, ip1->fragment_id, fid[1], + ip4_header_t, fragment_id); + ip0->fragment_id = fid[0]; + ip1->fragment_id = fid[1]; + fid += 2; + + ip0->checksum = ip_csum_fold (sum0); + ip1->checksum = ip_csum_fold (sum1); + + ASSERT (ip0->checksum == ip4_header_checksum (ip0)); + ASSERT (ip1->checksum == ip4_header_checksum (ip1)); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + icmp46_header_t * icmp0; + u32 bi0, src0, dst0; + ip_csum_t sum0; + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + icmp0 = ip4_next_header (ip0); + + vnet_buffer (p0)->sw_if_index[VLIB_RX] = vnet_main.local_interface_sw_if_index; + + /* Update ICMP checksum. */ + sum0 = icmp0->checksum; + + ASSERT (icmp0->type == ICMP4_echo_request); + sum0 = ip_csum_update (sum0, ICMP4_echo_request, ICMP4_echo_reply, + icmp46_header_t, type); + icmp0->type = ICMP4_echo_reply; + icmp0->checksum = ip_csum_fold (sum0); + + src0 = ip0->src_address.data_u32; + dst0 = ip0->dst_address.data_u32; + ip0->src_address.data_u32 = dst0; + ip0->dst_address.data_u32 = src0; + + /* Update IP checksum. 
*/ + sum0 = ip0->checksum; + + sum0 = ip_csum_update (sum0, ip0->ttl, host_config_ttl, + ip4_header_t, ttl); + ip0->ttl = host_config_ttl; + + sum0 = ip_csum_update (sum0, ip0->fragment_id, fid[0], + ip4_header_t, fragment_id); + ip0->fragment_id = fid[0]; + fid += 1; + + ip0->checksum = ip_csum_fold (sum0); + + ASSERT (ip0->checksum == ip4_header_checksum (ip0)); + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + vlib_error_count (vm, ip4_icmp_input_node.index, + ICMP4_ERROR_ECHO_REPLIES_SENT, + frame->n_vectors); + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip4_icmp_echo_request_node,static) = { + .function = ip4_icmp_echo_request, + .name = "ip4-icmp-echo-request", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp_input_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "ip4-rewrite-local", + }, +}; + +typedef enum { + ICMP4_TTL_EXPIRE_NEXT_DROP, + ICMP4_TTL_EXPIRE_NEXT_LOOKUP, + ICMP4_TTL_EXPIRE_N_NEXT, +} icmp_ttl_expire_next_t; + +static uword +ip4_icmp_ttl_expire (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 * from, * to_next; + uword n_left_from, n_left_to_next; + icmp_ttl_expire_next_t next_index; + ip4_main_t *im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + + from = vlib_frame_vector_args(frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, sizeof (icmp_input_trace_t)); + + while (n_left_from > 0) + { + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0 = from[0]; + u32 next0 = ICMP4_TTL_EXPIRE_NEXT_LOOKUP; + u8 error0 = ICMP4_ERROR_TTL_EXPIRE_RESP_SENT; + u32 len0, new_len0; + vlib_buffer_t * p0; + ip4_header_t * ip0, * out_ip0; + icmp46_header_t * icmp0; + ip_csum_t sum; + u32 sw_if_index0, if_add_index0; + + /* 
Speculatively enqueue p0 to the current next frame */ + to_next[0] = pi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer(vm, pi0); + ip0 = vlib_buffer_get_current(p0); + len0 = vlib_buffer_length_in_chain (vm, p0); + sw_if_index0 = vnet_buffer(p0)->sw_if_index[VLIB_RX]; + + /* Cut payload to just IP header plus first 8 bytes */ + new_len0 = (ip0->ip_version_and_header_length &0xf)*4 + 8; + if (len0 > new_len0) + { + p0->current_length = new_len0; /* should fit in 1st buffer */ + if (PREDICT_FALSE(p0->total_length_not_including_first_buffer)) + { /* clear current_length of all other buffers in chain */ + vlib_buffer_t *b = p0; + p0->total_length_not_including_first_buffer = 0; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + b->current_length = 0; + } + } + } + + /* Add IP header and ICMP header including a 4 byte unused field */ + vlib_buffer_advance(p0, + -sizeof(ip4_header_t)-sizeof(icmp46_header_t)-4); + out_ip0 = vlib_buffer_get_current(p0); + icmp0 = (icmp46_header_t *) &out_ip0[1]; + + /* Zero the 4 byte unused field that follows the ICMP header: nothing + else writes it before it is folded into the ICMP checksum. */ + memset (icmp0 + 1, 0, 4); + + /* Fill ip header fields */ + out_ip0->ip_version_and_header_length = 0x45; + out_ip0->tos = 0; + out_ip0->length = clib_host_to_net_u16(p0->current_length); + out_ip0->fragment_id = 0; + /* Not a fragment; must be set explicitly because the header is + built from scratch and is covered by the IP checksum below. 
*/ + out_ip0->flags_and_fragment_offset = 0; + out_ip0->ttl = 0xff; + out_ip0->protocol = IP_PROTOCOL_ICMP; + out_ip0->dst_address = ip0->src_address; + if_add_index0 = + lm->if_address_pool_index_by_sw_if_index[sw_if_index0]; + if (PREDICT_TRUE(if_add_index0 != ~0)) + { + ip_interface_address_t *if_add = + pool_elt_at_index(lm->if_address_pool, if_add_index0); + ip4_address_t *if_ip = + ip_interface_address_get_address(lm, if_add); + out_ip0->src_address = *if_ip; + vlib_error_count (vm, node->node_index, error0, 1); + } + else /* interface has no IP4 address - should not happen */ + { + next0 = ICMP4_TTL_EXPIRE_NEXT_DROP; + error0 = ICMP4_ERROR_TTL_EXPIRE_RESP_DROP; + } + out_ip0->checksum = ip4_header_checksum(out_ip0); + + /* Fill icmp 
header fields */ + icmp0->type = ICMP4_time_exceeded; + icmp0->code = ICMP4_time_exceeded_ttl_exceeded_in_transit; + icmp0->checksum = 0; + sum = ip_incremental_checksum( + 0, icmp0, p0->current_length - sizeof(ip4_header_t)); + icmp0->checksum = ~ip_csum_fold(sum); + + /* Update error status */ + p0->error = node->errors[error0]; + + /* Verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1(vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip4_icmp_ttl_expire_node) = { + .function = ip4_icmp_ttl_expire, + .name = "ip4-icmp-ttl-expire", + .vector_size = sizeof (u32), + + .n_errors = ARRAY_LEN (icmp_error_strings), + .error_strings = icmp_error_strings, + + .n_next_nodes = ICMP4_TTL_EXPIRE_N_NEXT, + .next_nodes = { + [ICMP4_TTL_EXPIRE_NEXT_DROP] = "error-drop", + [ICMP4_TTL_EXPIRE_NEXT_LOOKUP] = "ip4-lookup", + }, + + .format_trace = format_icmp_input_trace, +}; + + +static uword unformat_icmp_type_and_code (unformat_input_t * input, va_list * args) +{ + icmp46_header_t * h = va_arg (*args, icmp46_header_t *); + icmp4_main_t * cm = &icmp4_main; + u32 i; + + if (unformat_user (input, unformat_vlib_number_by_name, + cm->type_and_code_by_name, &i)) + { + h->type = (i >> 8) & 0xff; + h->code = (i >> 0) & 0xff; + } + else if (unformat_user (input, unformat_vlib_number_by_name, + cm->type_by_name, &i)) + { + h->type = i; + h->code = 0; + } + else + return 0; + + return 1; +} + +static void +icmp4_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets) +{ + vlib_main_t * vm = pg->vlib_main; + u32 ip_offset, icmp_offset; + + icmp_offset = g->start_byte_offset; + ip_offset = (g-1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + icmp46_header_t * icmp0; + u32 len0; + + p0 = vlib_get_buffer (vm, 
packets[0]); + n_packets -= 1; + packets += 1; + + ASSERT (p0->current_data == 0); + ip0 = (void *) (p0->data + ip_offset); + icmp0 = (void *) (p0->data + icmp_offset); + len0 = clib_net_to_host_u16 (ip0->length) - ip4_header_bytes (ip0); + icmp0->checksum = ~ ip_csum_fold (ip_incremental_checksum (0, icmp0, len0)); + } +} + +typedef struct { + pg_edit_t type, code; + pg_edit_t checksum; +} pg_icmp46_header_t; + +always_inline void +pg_icmp_header_init (pg_icmp46_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, icmp46_header_t, f); + _ (type); + _ (code); + _ (checksum); +#undef _ +} + +static uword +unformat_pg_icmp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_icmp46_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (icmp46_header_t), + &group_index); + pg_icmp_header_init (p); + + p->checksum.type = PG_EDIT_UNSPECIFIED; + + { + icmp46_header_t tmp; + + if (! unformat (input, "ICMP %U", unformat_icmp_type_and_code, &tmp)) + goto error; + + pg_edit_set_fixed (&p->type, tmp.type); + pg_edit_set_fixed (&p->code, tmp.code); + } + + /* Parse options. */ + while (1) + { + if (unformat (input, "checksum %U", + unformat_pg_edit, + unformat_pg_number, &p->checksum)) + ; + + /* Can't parse input: try next protocol level. */ + else + break; + } + + if (! unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = icmp4_pg_edit_function; + g->edit_function_opaque = 0; + } + + return 1; + + error: + /* Free up any edits we may have added. 
*/ + pg_free_edit_group (s); + return 0; +} + +void ip4_icmp_register_type (vlib_main_t * vm, icmp4_type_t type, + u32 node_index) +{ + icmp4_main_t * im = &icmp4_main; + + ASSERT (type < ARRAY_LEN (im->ip4_input_next_index_by_type)); + im->ip4_input_next_index_by_type[type] + = vlib_node_add_next (vm, ip4_icmp_input_node.index, node_index); +} + +static clib_error_t * +icmp4_init (vlib_main_t * vm) +{ + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi; + icmp4_main_t * cm = &icmp4_main; + clib_error_t * error; + + error = vlib_call_init_function (vm, ip_main_init); + + if (error) + return error; + + pi = ip_get_protocol_info (im, IP_PROTOCOL_ICMP); + pi->format_header = format_ip4_icmp_header; + pi->unformat_pg_edit = unformat_pg_icmp_header; + + cm->type_by_name = hash_create_string (0, sizeof (uword)); +#define _(n,t) hash_set_mem (cm->type_by_name, #t, (n)); + foreach_icmp4_type; +#undef _ + + cm->type_and_code_by_name = hash_create_string (0, sizeof (uword)); +#define _(a,n,t) hash_set_mem (cm->type_by_name, #t, (n) | (ICMP4_##a << 8)); + foreach_icmp4_code; +#undef _ + + memset (cm->ip4_input_next_index_by_type, + ICMP_INPUT_NEXT_ERROR, + sizeof (cm->ip4_input_next_index_by_type)); + + ip4_icmp_register_type (vm, ICMP4_echo_request, ip4_icmp_echo_request_node.index); + + return 0; +} + +VLIB_INIT_FUNCTION (icmp4_init); diff --git a/vnet/vnet/ip/icmp46_packet.h b/vnet/vnet/ip/icmp46_packet.h new file mode 100644 index 00000000000..fa3fed4d081 --- /dev/null +++ b/vnet/vnet/ip/icmp46_packet.h @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * icmp46_packet.h: ip4/ip6 icmp packet format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_vnet_icmp46_packet_h +#define included_vnet_icmp46_packet_h + +#include <vnet/ethernet/packet.h> +#include <vnet/ip/ip6_packet.h> + +#define foreach_icmp4_type \ + _ (0, echo_reply) \ + _ (3, destination_unreachable) \ + _ (4, source_quench) \ + _ (5, redirect) \ + _ (6, alternate_host_address) \ + _ (8, echo_request) \ + _ (9, router_advertisement) \ + _ (10, router_solicitation) \ + _ (11, time_exceeded) \ + _ (12, parameter_problem) \ + _ (13, timestamp_request) \ + _ (14, timestamp_reply) \ + _ (15, information_request) \ + _ (16, information_reply) \ + _ (17, address_mask_request) \ + _ (18, address_mask_reply) \ + _ (30, traceroute) \ + _ (31, datagram_conversion_error) \ + _ (32, mobile_host_redirect) \ + _ (33, ip6_where_are_you) \ + _ (34, ip6_i_am_here) \ + _ (35, mobile_registration_request) \ + _ (36, mobile_registration_reply) \ + _ (37, domain_name_request) \ + _ (38, domain_name_reply) \ + _ (39, skip) \ + _ (40, photuris) + +#define icmp_no_code 0 + +#define foreach_icmp4_code \ + _ (destination_unreachable, 0, destination_unreachable_net) \ + _ (destination_unreachable, 1, destination_unreachable_host) \ + _ (destination_unreachable, 2, protocol_unreachable) \ + _ (destination_unreachable, 3, port_unreachable) \ + _ (destination_unreachable, 4, fragmentation_needed_and_dont_fragment_set) \ + _ (destination_unreachable, 5, source_route_failed) \ + _ (destination_unreachable, 6, destination_network_unknown) \ + _ (destination_unreachable, 7, destination_host_unknown) \ + _ (destination_unreachable, 8, source_host_isolated) \ + _ (destination_unreachable, 9, network_administratively_prohibited) \ + _ (destination_unreachable, 10, host_administratively_prohibited) \ + _ (destination_unreachable, 11, network_unreachable_for_type_of_service) \ + _ (destination_unreachable, 12, host_unreachable_for_type_of_service) \ + _ (destination_unreachable, 13, communication_administratively_prohibited) \ + _ (destination_unreachable, 14, 
host_precedence_violation) \ + _ (destination_unreachable, 15, precedence_cutoff_in_effect) \ + _ (redirect, 0, network_redirect) \ + _ (redirect, 1, host_redirect) \ + _ (redirect, 2, type_of_service_and_network_redirect) \ + _ (redirect, 3, type_of_service_and_host_redirect) \ + _ (router_advertisement, 0, normal_router_advertisement) \ + _ (router_advertisement, 16, does_not_route_common_traffic) \ + _ (time_exceeded, 0, ttl_exceeded_in_transit) \ + _ (time_exceeded, 1, fragment_reassembly_time_exceeded) \ + _ (parameter_problem, 0, pointer_indicates_error) \ + _ (parameter_problem, 1, missing_required_option) \ + _ (parameter_problem, 2, bad_length) + +/* ICMPv6 */ +#define foreach_icmp6_type \ + _ (1, destination_unreachable) \ + _ (2, packet_too_big) \ + _ (3, time_exceeded) \ + _ (4, parameter_problem) \ + _ (128, echo_request) \ + _ (129, echo_reply) \ + _ (130, multicast_listener_request) \ + _ (131, multicast_listener_report) \ + _ (132, multicast_listener_done) \ + _ (133, router_solicitation) \ + _ (134, router_advertisement) \ + _ (135, neighbor_solicitation) \ + _ (136, neighbor_advertisement) \ + _ (137, redirect) \ + _ (138, router_renumbering) \ + _ (139, node_information_request) \ + _ (140, node_information_response) \ + _ (141, inverse_neighbor_solicitation) \ + _ (142, inverse_neighbor_advertisement) \ + _ (143, multicast_listener_report_v2) \ + _ (144, home_agent_address_discovery_request) \ + _ (145, home_agent_address_discovery_reply) \ + _ (146, mobile_prefix_solicitation) \ + _ (147, mobile_prefix_advertisement) \ + _ (148, certification_path_solicitation) \ + _ (149, certification_path_advertisement) \ + _ (151, multicast_router_advertisement) \ + _ (152, multicast_router_solicitation) \ + _ (153, multicast_router_termination) \ + _ (154, fmipv6_messages) + +#define foreach_icmp6_code \ + _ (destination_unreachable, 0, no_route_to_destination) \ + _ (destination_unreachable, 1, destination_administratively_prohibited) \ + _ 
(destination_unreachable, 2, beyond_scope_of_source_address) \ + _ (destination_unreachable, 3, address_unreachable) \ + _ (destination_unreachable, 4, port_unreachable) \ + _ (destination_unreachable, 5, source_address_failed_policy) \ + _ (destination_unreachable, 6, reject_route_to_destination) \ + _ (time_exceeded, 0, ttl_exceeded_in_transit) \ + _ (time_exceeded, 1, fragment_reassembly_time_exceeded) \ + _ (parameter_problem, 0, erroneous_header_field) \ + _ (parameter_problem, 1, unrecognized_next_header) \ + _ (parameter_problem, 2, unrecognized_option) \ + _ (router_renumbering, 0, command) \ + _ (router_renumbering, 1, result) \ + _ (node_information_request, 0, data_contains_ip6_address) \ + _ (node_information_request, 1, data_contains_name) \ + _ (node_information_request, 2, data_contains_ip4_address) \ + _ (node_information_response, 0, success) \ + _ (node_information_response, 1, failed) \ + _ (node_information_response, 2, unknown_request) + +typedef enum { +#define _(n,f) ICMP4_##f = n, + foreach_icmp4_type +#undef _ +} icmp4_type_t; + +typedef enum { +#define _(t,n,f) ICMP4_##t##_##f = n, + foreach_icmp4_code +#undef _ +} icmp4_code_t; + +typedef enum { +#define _(n,f) ICMP6_##f = n, + foreach_icmp6_type +#undef _ +} icmp6_type_t; + +typedef enum { +#define _(t,n,f) ICMP6_##t##_##f = n, + foreach_icmp6_code +#undef _ +} icmp6_code_t; + +typedef CLIB_PACKED (struct { + u8 type; + + u8 code; + + /* IP checksum of icmp header plus data which follows. 
*/ + u16 checksum; +}) icmp46_header_t; + +/* ip6 neighbor discovery */ +#define foreach_icmp6_neighbor_discovery_option \ + _ (1, source_link_layer_address) \ + _ (2, target_link_layer_address) \ + _ (3, prefix_information) \ + _ (4, redirected_header) \ + _ (5, mtu) \ + _ (6, nbma_shortcut_limit) \ + _ (7, advertisement_interval) \ + _ (8, home_agent_information) \ + _ (9, source_address_list) \ + _ (10, target_address_list) \ + _ (11, cryptographically_generated_address) \ + _ (12, rsa_signature) \ + _ (13, timestamp) \ + _ (14, nonce) \ + _ (15, trust_anchor) \ + _ (16, certificate) \ + _ (17, ip_address_and_prefix) \ + _ (18, new_router_prefix_information) \ + _ (19, mobile_link_layer_address) \ + _ (20, neighbor_advertisement_acknowledgment) \ + _ (23, map) \ + _ (24, route_information) \ + _ (25, recursive_dns_server) \ + _ (26, ra_flags_extension) \ + _ (27, handover_key_request) \ + _ (28, handover_key_reply) \ + _ (29, handover_assist_information) \ + _ (30, mobile_node_identifier) \ + _ (31, dns_search_list) \ + _ (138, card_request) \ + _ (139, card_reply) + +typedef enum icmp6_neighbor_discovery_option_type { +#define _(n,f) ICMP6_NEIGHBOR_DISCOVERY_OPTION_##f = n, + foreach_icmp6_neighbor_discovery_option +#undef _ +} icmp6_neighbor_discovery_option_type_t; + +typedef CLIB_PACKED (struct { + /* Option type. */ + u8 type; + + /* Length of this header plus option data in 8 byte units. */ + u8 n_data_u64s; + + /* Option data follows. 
*/ + u8 data[0]; +}) icmp6_neighbor_discovery_option_header_t; + +typedef CLIB_PACKED (struct { + icmp6_neighbor_discovery_option_header_t header; + u8 dst_address_length; + u8 flags; +#define ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_FLAG_ON_LINK (1 << 7) +#define ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_AUTO (1 << 6) + u32 valid_time; + u32 preferred_time; + u32 unused; + ip6_address_t dst_address; +}) icmp6_neighbor_discovery_prefix_information_option_t; + +typedef CLIB_PACKED (struct { + u8 type; + u8 aux_data_len_u32s; + u16 num_sources; + ip6_address_t mcast_addr; + ip6_address_t source_addr[0]; +}) icmp6_multicast_address_record_t; + +typedef CLIB_PACKED (struct { + ip6_hop_by_hop_ext_t ext_hdr; + ip6_router_alert_option_t alert; + ip6_padN_option_t pad; + icmp46_header_t icmp; + u16 rsvd; + u16 num_addr_records; + icmp6_multicast_address_record_t records[0]; +}) icmp6_multicast_listener_report_header_t; + +typedef CLIB_PACKED (struct { + icmp6_neighbor_discovery_option_header_t header; + u8 reserved[6]; + /* IP6 header plus payload follows. */ + u8 data[0]; +}) icmp6_neighbor_discovery_redirected_header_option_t; + +typedef CLIB_PACKED (struct { + icmp6_neighbor_discovery_option_header_t header; + u16 unused; + u32 mtu; +}) icmp6_neighbor_discovery_mtu_option_t; + +typedef CLIB_PACKED (struct { + icmp6_neighbor_discovery_option_header_t header; + u8 ethernet_address[6]; +}) icmp6_neighbor_discovery_ethernet_link_layer_address_option_t; + +typedef CLIB_PACKED (struct { + icmp6_neighbor_discovery_option_header_t header; + u8 max_l2_address[6+8]; +}) icmp6_neighbor_discovery_max_link_layer_address_option_t; + +/* Generic neighbor discovery header. Used for router solicitations, + etc. */ +typedef CLIB_PACKED (struct { + icmp46_header_t icmp; + + u32 reserved_must_be_zero; +}) icmp6_neighbor_discovery_header_t; + +/* Router advertisement packet formats.
*/ +typedef CLIB_PACKED (struct { + icmp46_header_t icmp; + + /* Current hop limit to use for outgoing packets. */ + u8 current_hop_limit; + + u8 flags; +#define ICMP6_ROUTER_DISCOVERY_FLAG_ADDRESS_CONFIG_VIA_DHCP (1 << 7) +#define ICMP6_ROUTER_DISCOVERY_FLAG_OTHER_CONFIG_VIA_DHCP (1 << 6) + + /* Zero means unspecified. */ + u16 router_lifetime_in_sec; + + /* Zero means unspecified. */ + u32 neighbor_reachable_time_in_msec; + + /* Zero means unspecified. */ + u32 time_in_msec_between_retransmitted_neighbor_solicitations; + + /* Options that may follow: source_link_layer_address, mtu, prefix_information. */ +}) icmp6_router_advertisement_header_t; + +/* Neighbor solicitation/advertisement header. */ +typedef CLIB_PACKED (struct { + icmp46_header_t icmp; + + /* Zero for solicitation; flags for advertisement. */ + u32 advertisement_flags; + /* Set when sent by a router. */ +#define ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_ROUTER (1 << 31) + /* Set when response to solicitation. */ +#define ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_SOLICITED (1 << 30) +#define ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_OVERRIDE (1 << 29) + + ip6_address_t target_address; + + /* Options that may follow: source_link_layer_address + (for solicitation) target_link_layer_address (for advertisement). */ +}) icmp6_neighbor_solicitation_or_advertisement_header_t; + +typedef CLIB_PACKED (struct { + icmp46_header_t icmp; + + u32 reserved_must_be_zero; + + /* Better next hop to use for given destination. */ + ip6_address_t better_next_hop_address; + + ip6_address_t dst_address; + + /* Options that may follow: target_link_layer_address, + redirected_header. */ +}) icmp6_redirect_header_t; + +/* Solicitation/advertisement packet format for ethernet. 
*/ +typedef CLIB_PACKED (struct { + ip6_header_t ip; + + icmp6_neighbor_solicitation_or_advertisement_header_t neighbor; + + icmp6_neighbor_discovery_ethernet_link_layer_address_option_t link_layer_option; +}) icmp6_neighbor_solicitation_header_t; + +/* Router solicitation packet format for ethernet. */ +typedef CLIB_PACKED (struct { + ip6_header_t ip; + icmp6_neighbor_discovery_header_t neighbor; + icmp6_neighbor_discovery_ethernet_link_layer_address_option_t link_layer_option; +}) icmp6_router_solicitation_header_t; + +/* router advertisement packet format for ethernet. */ +typedef CLIB_PACKED (struct { + ip6_header_t ip; + icmp6_router_advertisement_header_t router; + icmp6_neighbor_discovery_ethernet_link_layer_address_option_t link_layer_option; + icmp6_neighbor_discovery_mtu_option_t mtu_option; + icmp6_neighbor_discovery_prefix_information_option_t prefix[0]; +}) icmp6_router_advertisement_packet_t; + +/* multicast listener report packet format for ethernet. */ +typedef CLIB_PACKED (struct { + ip6_header_t ip; + icmp6_multicast_listener_report_header_t report_hdr; +}) icmp6_multicast_listener_report_packet_t; + +#endif /* included_vnet_icmp46_packet_h */ diff --git a/vnet/vnet/ip/icmp6.c b/vnet/vnet/ip/icmp6.c new file mode 100644 index 00000000000..2d265d2b5b2 --- /dev/null +++ b/vnet/vnet/ip/icmp6.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip/icmp6.c: ip6 icmp + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vlib/vlib.h> +#include <vnet/ip/ip.h> +#include <vnet/pg/pg.h> + +static u8 * format_ip6_icmp_type_and_code (u8 * s, va_list * args) +{ + icmp6_type_t type = va_arg (*args, int); + u8 code = va_arg (*args, int); + char * t = 0; + +#define _(n,f) case n: t = #f; break; + + switch (type) + { + foreach_icmp6_type; + + default: + break; + } + +#undef _ + + if (! 
t) + return format (s, "unknown 0x%x", type); + + s = format (s, "%s", t); + + t = 0; + switch ((type << 8) | code) + { +#define _(a,n,f) case (ICMP6_##a << 8) | (n): t = #f; break; + + foreach_icmp6_code; + +#undef _ + } + + if (t) + s = format (s, " %s", t); + + return s; +} + +static u8 * format_icmp6_header (u8 * s, va_list * args) +{ + icmp46_header_t * icmp = va_arg (*args, icmp46_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + + /* Nothing to do. */ + if (max_header_bytes < sizeof (icmp[0])) + return format (s, "ICMP header truncated"); + + s = format (s, "ICMP %U checksum 0x%x", + format_ip6_icmp_type_and_code, icmp->type, icmp->code, + clib_net_to_host_u16 (icmp->checksum)); + + if (max_header_bytes >= + sizeof(icmp6_neighbor_solicitation_or_advertisement_header_t) && + (icmp->type == ICMP6_neighbor_solicitation || + icmp->type == ICMP6_neighbor_advertisement)) + { + icmp6_neighbor_solicitation_or_advertisement_header_t *icmp6_nd = + (icmp6_neighbor_solicitation_or_advertisement_header_t *) icmp; + s = format (s, "\n target address %U", + format_ip6_address, &icmp6_nd->target_address); + } + + return s; +} + +u8 * format_icmp6_input_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + icmp6_input_trace_t * t = va_arg (*va, icmp6_input_trace_t *); + + s = format (s, "%U", + format_ip6_header, + t->packet_data, sizeof (t->packet_data)); + + return s; +} + +static char * icmp_error_strings[] = { +#define _(f,s) s, + foreach_icmp6_error +#undef _ +}; + +typedef enum { + ICMP_INPUT_NEXT_DROP, + ICMP_INPUT_N_NEXT, +} icmp_input_next_t; + +typedef struct { + uword * type_and_code_by_name; + + uword * type_by_name; + + /* Vector dispatch table indexed by [icmp type]. */ + u8 input_next_index_by_type[256]; + + /* Max valid code indexed by icmp type. */ + u8 max_valid_code_by_type[256]; + + /* hop_limit must be >= this value for this icmp type. 
*/ + u8 min_valid_hop_limit_by_type[256]; + + u8 min_valid_length_by_type[256]; +} icmp6_main_t; + +icmp6_main_t icmp6_main; + +static uword +ip6_icmp_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + icmp6_main_t * im = &icmp6_main; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next_index; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (icmp6_input_trace_t)); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * b0; + ip6_header_t * ip0; + icmp46_header_t * icmp0; + icmp6_type_t type0; + u32 bi0, next0, error0, len0; + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (b0); + icmp0 = ip6_next_header (ip0); + type0 = icmp0->type; + + error0 = ICMP6_ERROR_NONE; + + next0 = im->input_next_index_by_type[type0]; + error0 = next0 == ICMP_INPUT_NEXT_DROP ? ICMP6_ERROR_UNKNOWN_TYPE : error0; + + /* Check code is valid for type. */ + error0 = icmp0->code > im->max_valid_code_by_type[type0] ? ICMP6_ERROR_INVALID_CODE_FOR_TYPE : error0; + + /* Checksum is already validated by ip6_local node so we don't need to check that. */ + + /* Check that hop limit == 255 for certain types. */ + error0 = ip0->hop_limit < im->min_valid_hop_limit_by_type[type0] ? ICMP6_ERROR_INVALID_HOP_LIMIT_FOR_TYPE : error0; + + len0 = clib_net_to_host_u16 (ip0->payload_length); + error0 = len0 < im->min_valid_length_by_type[type0] ? ICMP6_ERROR_LENGTH_TOO_SMALL_FOR_TYPE : error0; + + b0->error = node->errors[error0]; + + next0 = error0 != ICMP6_ERROR_NONE ? 
ICMP_INPUT_NEXT_DROP : next0; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_icmp_input_node) = { + .function = ip6_icmp_input, + .name = "ip6-icmp-input", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp6_input_trace, + + .n_errors = ARRAY_LEN (icmp_error_strings), + .error_strings = icmp_error_strings, + + .n_next_nodes = 1, + .next_nodes = { + [ICMP_INPUT_NEXT_DROP] = "error-drop", + }, +}; + +typedef enum { + ICMP6_ECHO_REQUEST_NEXT_LOOKUP, + ICMP6_ECHO_REQUEST_NEXT_OUTPUT, + ICMP6_ECHO_REQUEST_N_NEXT, +} icmp6_echo_request_next_t; + +static uword +ip6_icmp_echo_request (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next_index; + ip6_main_t * im = &ip6_main; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (icmp6_input_trace_t)); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 2 && n_left_to_next > 2) + { + vlib_buffer_t * p0, * p1; + ip6_header_t * ip0, * ip1; + icmp46_header_t * icmp0, * icmp1; + ip6_address_t tmp0, tmp1; + ip_csum_t sum0, sum1; + u32 bi0, bi1; + u32 fib_index0, fib_index1; + u32 next0 = ICMP6_ECHO_REQUEST_NEXT_LOOKUP; + u32 next1 = ICMP6_ECHO_REQUEST_NEXT_LOOKUP; + + bi0 = to_next[0] = from[0]; + bi1 = to_next[1] = from[1]; + + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, bi0); + p1 = vlib_get_buffer (vm, bi1); + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + icmp0 = ip6_next_header (ip0); + 
icmp1 = ip6_next_header (ip1); + + /* Change icmp type to echo reply and update icmp checksum. */ + sum0 = icmp0->checksum; + sum1 = icmp1->checksum; + + ASSERT (icmp0->type == ICMP6_echo_request); + ASSERT (icmp1->type == ICMP6_echo_request); + sum0 = ip_csum_update (sum0, ICMP6_echo_request, ICMP6_echo_reply, + icmp46_header_t, type); + sum1 = ip_csum_update (sum1, ICMP6_echo_request, ICMP6_echo_reply, + icmp46_header_t, type); + + icmp0->checksum = ip_csum_fold (sum0); + icmp1->checksum = ip_csum_fold (sum1); + + icmp0->type = ICMP6_echo_reply; + icmp1->type = ICMP6_echo_reply; + + /* Swap source and destination address. */ + tmp0 = ip0->src_address; + tmp1 = ip1->src_address; + + ip0->src_address = ip0->dst_address; + ip1->src_address = ip1->dst_address; + + ip0->dst_address = tmp0; + ip1->dst_address = tmp1; + + /* New hop count. */ + ip0->hop_limit = im->host_config.ttl; + ip1->hop_limit = im->host_config.ttl; + + if (ip6_address_is_link_local_unicast (&ip0->dst_address)) + { + ethernet_header_t *eth0; + u8 tmp_mac[6]; + /* For link local, reuse current MAC header by swapping + * SMAC to DMAC instead of IP6 lookup since link local + * is not in the IP6 FIB */ + vlib_buffer_reset (p0); + eth0 = vlib_buffer_get_current (p0); + memcpy (tmp_mac, eth0->dst_address, 6); + memcpy (eth0->dst_address, eth0->src_address, 6); + memcpy (eth0->src_address, tmp_mac, 6); + vnet_buffer(p0)->sw_if_index[VLIB_TX] = + vnet_buffer (p0)->sw_if_index[VLIB_RX]; + next0 = ICMP6_ECHO_REQUEST_NEXT_OUTPUT; + } + else + { + /* Determine the correct lookup fib indices...
*/ + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer (p0)->sw_if_index[VLIB_RX]); + vnet_buffer (p0)->sw_if_index[VLIB_TX] = fib_index0; + } + + if (ip6_address_is_link_local_unicast (&ip1->dst_address)) + { + ethernet_header_t *eth1; + u8 tmp_mac[6]; + /* For link local, reuse current MAC header by swapping + * SMAC to DMAC instead of IP6 lookup since link local + * is not in the IP6 FIB */ + vlib_buffer_reset (p1); + eth1 = vlib_buffer_get_current (p1); + memcpy (tmp_mac, eth1->dst_address, 6); + memcpy (eth1->dst_address, eth1->src_address, 6); + memcpy (eth1->src_address, tmp_mac, 6); + vnet_buffer(p1)->sw_if_index[VLIB_TX] = + vnet_buffer (p1)->sw_if_index[VLIB_RX]; + next1 = ICMP6_ECHO_REQUEST_NEXT_OUTPUT; + } + else + { + /* Determine the correct lookup fib indices... */ + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer (p1)->sw_if_index[VLIB_RX]); + vnet_buffer (p1)->sw_if_index[VLIB_TX] = fib_index1; + } + + vnet_buffer (p0)->sw_if_index[VLIB_RX] + = vnet_main.local_interface_sw_if_index; + vnet_buffer (p1)->sw_if_index[VLIB_RX] + = vnet_main.local_interface_sw_if_index; + + /* verify speculative enqueues, maybe switch current next frame */ + /* if next0==next1==next_index then nothing special needs to be done */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + icmp46_header_t * icmp0; + u32 bi0; + ip6_address_t tmp0; + ip_csum_t sum0; + u32 fib_index0; + u32 next0 = ICMP6_ECHO_REQUEST_NEXT_LOOKUP; + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + icmp0 = ip6_next_header (ip0); + + /* Change icmp type to echo reply and update icmp checksum.
*/ + sum0 = icmp0->checksum; + + ASSERT (icmp0->type == ICMP6_echo_request); + sum0 = ip_csum_update (sum0, ICMP6_echo_request, ICMP6_echo_reply, + icmp46_header_t, type); + + icmp0->checksum = ip_csum_fold (sum0); + + icmp0->type = ICMP6_echo_reply; + + /* Swap source and destination address. */ + tmp0 = ip0->src_address; + ip0->src_address = ip0->dst_address; + ip0->dst_address = tmp0; + + ip0->hop_limit = im->host_config.ttl; + + if (ip6_address_is_link_local_unicast (&ip0->dst_address)) + { + ethernet_header_t *eth0; + u8 tmp_mac[6]; + /* For link local, reuse current MAC header by swapping + * SMAC to DMAC instead of IP6 lookup since link local + * is not in the IP6 FIB */ + vlib_buffer_reset (p0); + eth0 = vlib_buffer_get_current (p0); + memcpy (tmp_mac, eth0->dst_address, 6); + memcpy (eth0->dst_address, eth0->src_address, 6); + memcpy (eth0->src_address, tmp_mac, 6); + vnet_buffer(p0)->sw_if_index[VLIB_TX] = + vnet_buffer (p0)->sw_if_index[VLIB_RX]; + next0 = ICMP6_ECHO_REQUEST_NEXT_OUTPUT; + } + else + { + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer (p0)->sw_if_index[VLIB_RX]); + vnet_buffer (p0)->sw_if_index[VLIB_TX] = fib_index0; + } + vnet_buffer (p0)->sw_if_index[VLIB_RX] + = vnet_main.local_interface_sw_if_index; + + /* Verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_error_count (vm, ip6_icmp_input_node.index, + ICMP6_ERROR_ECHO_REPLIES_SENT, + frame->n_vectors); + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_icmp_echo_request_node,static) = { + .function = ip6_icmp_echo_request, + .name = "ip6-icmp-echo-request", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp6_input_trace, + + .n_next_nodes = ICMP6_ECHO_REQUEST_N_NEXT, + .next_nodes = { + [ICMP6_ECHO_REQUEST_NEXT_LOOKUP] = "ip6-lookup", +
[ICMP6_ECHO_REQUEST_NEXT_OUTPUT] = "interface-output", + }, +}; + +typedef enum { + ICMP6_TTL_EXPIRE_NEXT_DROP, + ICMP6_TTL_EXPIRE_NEXT_LOOKUP, + ICMP6_TTL_EXPIRE_N_NEXT, +} icmp_ttl_expire_next_t; + +static uword +ip6_icmp_ttl_expire (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 * from, * to_next; + uword n_left_from, n_left_to_next; + icmp_ttl_expire_next_t next_index; + ip6_main_t *im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + + from = vlib_frame_vector_args(frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, sizeof (icmp6_input_trace_t)); + + while (n_left_from > 0) + { + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 pi0 = from[0]; + u32 next0 = ICMP6_TTL_EXPIRE_NEXT_LOOKUP; + u8 error0 = ICMP6_ERROR_TTL_EXPIRE_RESP_SENT; + vlib_buffer_t * p0; + ip6_header_t * ip0, * out_ip0; + icmp46_header_t * icmp0; + u32 sw_if_index0, if_add_index0; + int bogus_length; + + /* Speculatively enqueue p0 to the current next frame */ + to_next[0] = pi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer(vm, pi0); + ip0 = vlib_buffer_get_current(p0); + sw_if_index0 = vnet_buffer(p0)->sw_if_index[VLIB_RX]; + + /* RFC2463 says to keep as much of the original packet as possible + * within the MTU. 
We cheat "a little" here by keeping whatever fits + * in the first buffer, to be more efficient */ + if (PREDICT_FALSE(p0->total_length_not_including_first_buffer)) + { /* clear current_length of all other buffers in chain */ + vlib_buffer_t *b = p0; + p0->total_length_not_including_first_buffer = 0; + while (b->flags & VLIB_BUFFER_NEXT_PRESENT) + { + b = vlib_get_buffer (vm, b->next_buffer); + b->current_length = 0; + } + } + + /* Add IP header and ICMPv6 header including a 4 byte ununsed field */ + vlib_buffer_advance(p0, + -sizeof(ip6_header_t)-sizeof(icmp46_header_t)-4); + out_ip0 = vlib_buffer_get_current(p0); + icmp0 = (icmp46_header_t *) &out_ip0[1]; + + /* Fill ip header fields */ + out_ip0->ip_version_traffic_class_and_flow_label = + clib_host_to_net_u32(0x6<<28); + out_ip0->payload_length = + clib_host_to_net_u16(p0->current_length - sizeof(ip6_header_t)); + out_ip0->protocol = IP_PROTOCOL_ICMP6; + out_ip0->hop_limit = 0xff; + out_ip0->dst_address = ip0->src_address; + if_add_index0 = + lm->if_address_pool_index_by_sw_if_index[sw_if_index0]; + if (PREDICT_TRUE(if_add_index0 != ~0)) + { + ip_interface_address_t *if_add = + pool_elt_at_index(lm->if_address_pool, if_add_index0); + ip6_address_t *if_ip = + ip_interface_address_get_address(lm, if_add); + out_ip0->src_address = *if_ip; + vlib_error_count (vm, node->node_index, error0, 1); + } + else /* interface has no IP6 address - should not happen */ + { + next0 = ICMP6_TTL_EXPIRE_NEXT_DROP; + error0 = ICMP6_ERROR_TTL_EXPIRE_RESP_DROP; + } + + /* Fill icmp header fields */ + icmp0->type = ICMP6_time_exceeded; + icmp0->code = ICMP6_time_exceeded_ttl_exceeded_in_transit; + icmp0->checksum = 0; + icmp0->checksum = ip6_tcp_udp_icmp_compute_checksum( + vm, p0, out_ip0, &bogus_length); + + /* Update error status */ + p0->error = node->errors[error0]; + + /* Verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1(vm, node, next_index, + to_next, n_left_to_next, + pi0, 
next0); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_icmp_ttl_expire_node) = { + .function = ip6_icmp_ttl_expire, + .name = "ip6-icmp-ttl-expire", + .vector_size = sizeof (u32), + + .n_errors = ARRAY_LEN (icmp_error_strings), + .error_strings = icmp_error_strings, + + .n_next_nodes = ICMP6_TTL_EXPIRE_N_NEXT, + .next_nodes = { + [ICMP6_TTL_EXPIRE_NEXT_DROP] = "error-drop", + [ICMP6_TTL_EXPIRE_NEXT_LOOKUP] = "ip6-lookup", + }, + + .format_trace = format_icmp6_input_trace, +}; + + +static uword unformat_icmp_type_and_code (unformat_input_t * input, va_list * args) +{ + icmp46_header_t * h = va_arg (*args, icmp46_header_t *); + icmp6_main_t * cm = &icmp6_main; + u32 i; + + if (unformat_user (input, unformat_vlib_number_by_name, + cm->type_and_code_by_name, &i)) + { + h->type = (i >> 8) & 0xff; + h->code = (i >> 0) & 0xff; + } + else if (unformat_user (input, unformat_vlib_number_by_name, + cm->type_by_name, &i)) + { + h->type = i; + h->code = 0; + } + else + return 0; + + return 1; +} + +static void +icmp6_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets) +{ + vlib_main_t * vm = pg->vlib_main; + u32 ip_offset, icmp_offset; + int bogus_length; + + icmp_offset = g->start_byte_offset; + ip_offset = (g-1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + icmp46_header_t * icmp0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ASSERT (p0->current_data == 0); + ip0 = (void *) (p0->data + ip_offset); + icmp0 = (void *) (p0->data + icmp_offset); + + icmp0->checksum = ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0, + &bogus_length); + ASSERT (bogus_length == 0); + } +} + +typedef struct { + pg_edit_t type, code; + pg_edit_t checksum; +} pg_icmp46_header_t; + +always_inline void +pg_icmp_header_init (pg_icmp46_header_t * p) +{ + /* Initialize fields that are 
not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, icmp46_header_t, f); + _ (type); + _ (code); + _ (checksum); +#undef _ +} + +static uword +unformat_pg_icmp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_icmp46_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (icmp46_header_t), + &group_index); + pg_icmp_header_init (p); + + p->checksum.type = PG_EDIT_UNSPECIFIED; + + { + icmp46_header_t tmp; + + if (! unformat (input, "ICMP %U", unformat_icmp_type_and_code, &tmp)) + goto error; + + pg_edit_set_fixed (&p->type, tmp.type); + pg_edit_set_fixed (&p->code, tmp.code); + } + + /* Parse options. */ + while (1) + { + if (unformat (input, "checksum %U", + unformat_pg_edit, + unformat_pg_number, &p->checksum)) + ; + + /* Can't parse input: try next protocol level. */ + else + break; + } + + if (! unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = icmp6_pg_edit_function; + g->edit_function_opaque = 0; + } + + return 1; + + error: + /* Free up any edits we may have added. 
*/ + pg_free_edit_group (s); + return 0; +} + +void icmp6_register_type (vlib_main_t * vm, icmp6_type_t type, u32 node_index) +{ + icmp6_main_t * im = &icmp6_main; + + ASSERT (type < ARRAY_LEN (im->input_next_index_by_type)); + im->input_next_index_by_type[type] + = vlib_node_add_next (vm, ip6_icmp_input_node.index, node_index); +} + +static clib_error_t * +icmp6_init (vlib_main_t * vm) +{ + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi; + icmp6_main_t * cm = &icmp6_main; + clib_error_t * error; + + error = vlib_call_init_function (vm, ip_main_init); + + if (error) + return error; + + pi = ip_get_protocol_info (im, IP_PROTOCOL_ICMP6); + pi->format_header = format_icmp6_header; + pi->unformat_pg_edit = unformat_pg_icmp_header; + + cm->type_by_name = hash_create_string (0, sizeof (uword)); +#define _(n,t) hash_set_mem (cm->type_by_name, #t, (n)); + foreach_icmp6_type; +#undef _ + + cm->type_and_code_by_name = hash_create_string (0, sizeof (uword)); +#define _(a,n,t) hash_set_mem (cm->type_by_name, #t, (n) | (ICMP6_##a << 8)); + foreach_icmp6_code; +#undef _ + + memset (cm->input_next_index_by_type, + ICMP_INPUT_NEXT_DROP, + sizeof (cm->input_next_index_by_type)); + memset (cm->max_valid_code_by_type, 0, sizeof (cm->max_valid_code_by_type)); + +#define _(a,n,t) cm->max_valid_code_by_type[ICMP6_##a] = clib_max (cm->max_valid_code_by_type[ICMP6_##a], n); + foreach_icmp6_code; +#undef _ + + memset (cm->min_valid_hop_limit_by_type, 0, sizeof (cm->min_valid_hop_limit_by_type)); + cm->min_valid_hop_limit_by_type[ICMP6_router_solicitation] = 255; + cm->min_valid_hop_limit_by_type[ICMP6_router_advertisement] = 255; + cm->min_valid_hop_limit_by_type[ICMP6_neighbor_solicitation] = 255; + cm->min_valid_hop_limit_by_type[ICMP6_neighbor_advertisement] = 255; + cm->min_valid_hop_limit_by_type[ICMP6_redirect] = 255; + + memset (cm->min_valid_length_by_type, sizeof (icmp46_header_t), sizeof (cm->min_valid_length_by_type)); + 
cm->min_valid_length_by_type[ICMP6_router_solicitation] = sizeof (icmp6_neighbor_discovery_header_t); + cm->min_valid_length_by_type[ICMP6_router_advertisement] = sizeof (icmp6_router_advertisement_header_t); + cm->min_valid_length_by_type[ICMP6_neighbor_solicitation] + = sizeof (icmp6_neighbor_solicitation_or_advertisement_header_t); + cm->min_valid_length_by_type[ICMP6_neighbor_advertisement] + = sizeof (icmp6_neighbor_solicitation_or_advertisement_header_t); + cm->min_valid_length_by_type[ICMP6_redirect] = sizeof (icmp6_redirect_header_t); + + icmp6_register_type (vm, ICMP6_echo_request, ip6_icmp_echo_request_node.index); + + return vlib_call_init_function (vm, ip6_neighbor_init); +} + +VLIB_INIT_FUNCTION (icmp6_init); diff --git a/vnet/vnet/ip/icmp6.h b/vnet/vnet/ip/icmp6.h new file mode 100644 index 00000000000..92f6913a454 --- /dev/null +++ b/vnet/vnet/ip/icmp6.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef included_vnet_icmp6_h +#define included_vnet_icmp6_h + +#define foreach_icmp6_error \ + _ (NONE, "valid packets") \ + _ (UNKNOWN_TYPE, "unknown type") \ + _ (INVALID_CODE_FOR_TYPE, "invalid code for type") \ + _ (INVALID_HOP_LIMIT_FOR_TYPE, "hop_limit != 255") \ + _ (LENGTH_TOO_SMALL_FOR_TYPE, "payload length too small for type") \ + _ (OPTIONS_WITH_ODD_LENGTH, \ + "total option length not multiple of 8 bytes") \ + _ (OPTION_WITH_ZERO_LENGTH, "option has zero length") \ + _ (ECHO_REPLIES_SENT, "echo replies sent") \ + _ (NEIGHBOR_SOLICITATION_SOURCE_NOT_ON_LINK, \ + "neighbor solicitations from source not on link") \ + _ (NEIGHBOR_SOLICITATION_SOURCE_UNKNOWN, \ + "neighbor solicitations for unknown targets") \ + _ (NEIGHBOR_ADVERTISEMENTS_TX, "neighbor advertisements sent") \ + _ (NEIGHBOR_ADVERTISEMENTS_RX, "neighbor advertisements received") \ + _ (ROUTER_SOLICITATION_SOURCE_NOT_ON_LINK, \ + "router solicitations from source not on link") \ + _ (ROUTER_SOLICITATION_UNSUPPORTED_INTF, \ + "neighbor discovery unsupported interface") \ + _ (ROUTER_SOLICITATION_RADV_NOT_CONFIG, \ + "neighbor discovery not configured") \ + _ (ROUTER_SOLICITATION_DEST_UNKNOWN, \ + "router solicitations for unknown destination") \ + _ (ROUTER_SOLICITATION_SOURCE_UNKNOWN, \ + "router solicitations for unknown source") \ + _ (ROUTER_ADVERTISEMENT_SOURCE_NOT_LINK_LOCAL, \ + "router advertisement source not link local") \ + _ (ROUTER_ADVERTISEMENTS_TX, "router advertisements sent") \ + _ (ROUTER_ADVERTISEMENTS_RX, "router advertisements received") \ + _ (DST_LOOKUP_MISS, "icmp6 dst address lookup misses") \ + _ (TTL_EXPIRE_RESP_SENT, "TTL time exceeded response sent") \ + _ (TTL_EXPIRE_RESP_DROP, "TTL time exceeded response dropped") + + +typedef enum { +#define _(f,s) ICMP6_ERROR_##f, + foreach_icmp6_error +#undef _ +} icmp6_error_t; + +typedef struct { + u8 packet_data[64]; +} icmp6_input_trace_t; + +format_function_t format_icmp6_input_trace; +void icmp6_register_type 
(vlib_main_t * vm, icmp6_type_t type, u32 node_index); + +extern vlib_node_registration_t ip6_icmp_input_node; + +#endif /* included_vnet_icmp6_h */ + + diff --git a/vnet/vnet/ip/igmp_packet.h b/vnet/vnet/ip/igmp_packet.h new file mode 100644 index 00000000000..00b1e0deeb7 --- /dev/null +++ b/vnet/vnet/ip/igmp_packet.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * igmp_packet.h: igmp packet format + * + * Copyright (c) 2011 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_vnet_igmp_packet_h +#define included_vnet_igmp_packet_h + +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/ip6_packet.h> + +#define foreach_igmp_type \ + _ (0x11, membership_query) \ + _ (0x12, membership_report_v1) \ + _ (0x13, dvmrp) \ + _ (0x14, pim_v1) \ + _ (0x15, cisco_trace) \ + _ (0x16, membership_report_v2) \ + _ (0x17, leave_group_v2) \ + _ (0x1e, traceroute_response) \ + _ (0x1f, traceroute_request) \ + _ (0x22, membership_report_v3) \ + _ (0x30, router_advertisement) \ + _ (0x31, router_solicitation) \ + _ (0x32, router_termination) + +typedef enum { +#define _(n,f) IGMP_TYPE_##f = n, + foreach_igmp_type +#undef _ +} igmp_type_t; + +typedef struct { + igmp_type_t type : 8; + + u8 code; + + u16 checksum; +} igmp_header_t; + +typedef struct { + /* membership_query, version <= 2 reports. */ + igmp_header_t header; + + /* Multicast destination address. */ + ip4_address_t dst; +} igmp_message_t; + +#define foreach_igmp_membership_group_v3_type \ + _ (1, mode_is_filter_include) \ + _ (2, mode_is_filter_exclude) \ + _ (3, change_to_filter_include) \ + _ (4, change_to_filter_exclude) \ + _ (5, allow_new_sources) \ + _ (6, block_old_sources) + +typedef enum { +#define _(n,f) IGMP_MEMBERSHIP_GROUP_##f = n, + foreach_igmp_membership_group_v3_type +#undef _ +} igmp_membership_group_v3_type_t; + +typedef struct { + igmp_membership_group_v3_type_t type : 8; + + /* Number of 32 bit words of aux data after source addresses. */ + u8 n_aux_u32s; + + /* Number of source addresses that follow. */ + u16 n_src_addresses; + + /* Destination multicast address. 
*/ + ip4_address_t dst_address; + + ip4_address_t src_addresses[0]; +} igmp_membership_group_v3_t; +/* Returns g advanced by the byte size of its source addresses and aux words. NOTE(review): the fixed fields of *g itself are not added and n_src_addresses is used without a byte swap -- confirm against the IGMPv3 group record layout before relying on this. */ +always_inline igmp_membership_group_v3_t * +igmp_membership_group_v3_next (igmp_membership_group_v3_t * g) +{ + return ((void *) g + + g->n_src_addresses * sizeof (g->src_addresses[0]) + + g->n_aux_u32s * sizeof (u32)); +} + +typedef struct { + /* Type 0x22. */ + igmp_header_t header; + + u16 unused; + + /* Number of groups which follow. */ + u16 n_groups; + + igmp_membership_group_v3_t groups[0]; +} igmp_membership_report_v3_t; + +/* IP6 flavor of IGMP is called MLD which is embedded in ICMP6. */ +typedef struct { + /* Preceded by ICMP v6 header. */ + u16 max_response_delay_in_milliseconds; + u16 reserved; + ip6_address_t dst; +} mld_header_t; + +#endif /* included_vnet_igmp_packet_h */ diff --git a/vnet/vnet/ip/ip.h b/vnet/vnet/ip/ip.h new file mode 100644 index 00000000000..e47512a960d --- /dev/null +++ b/vnet/vnet/ip/ip.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip/ip.h: ip generic (4 or 6) main + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_ip_main_h +#define included_ip_main_h + +#include <vppinfra/hash.h> +#include <vppinfra/heap.h> /* adjacency heap */ + +#include <vnet/vnet.h> + +#include <vnet/ip/format.h> +#include <vnet/ip/ip_packet.h> +#include <vnet/ip/lookup.h> + +#include <vnet/ip/tcp_packet.h> +#include <vnet/ip/udp_packet.h> +#include <vnet/ip/icmp46_packet.h> + +#include <vnet/ip/ip4.h> +#include <vnet/ip/ip4_error.h> +#include <vnet/ip/ip4_packet.h> + +#include <vnet/ip/ip6.h> +#include <vnet/ip/ip6_packet.h> +#include <vnet/ip/ip6_error.h> +#include <vnet/ip/icmp6.h> + +#include <vnet/ip/tcp.h> + +#if DPDK > 0 +#include <vnet/devices/dpdk/dpdk.h> +#endif + +#include <vnet/classify/vnet_classify.h> + +typedef union { + ip4_address_t ip4; + ip6_address_t ip6; +} ip46_address_t; + +/* Per protocol info. */ +typedef struct { + /* Protocol name (also used as hash key). */ + u8 * name; + + /* Protocol number. */ + ip_protocol_t protocol; + + /* Format function for this IP protocol. */ + format_function_t * format_header; + + /* Parser for header. */ + unformat_function_t * unformat_header; + + /* Parser for per-protocol matches. */ + unformat_function_t * unformat_match; + + /* Parser for packet generator edits for this protocol. */ + unformat_function_t * unformat_pg_edit; +} ip_protocol_info_t; + +/* Per TCP/UDP port info. */ +typedef struct { + /* Port name (used as hash key). */ + u8 * name; + + /* UDP/TCP port number in network byte order. */ + u16 port; + + /* Port specific format function. */ + format_function_t * format_header; + + /* Parser for packet generator edits for this protocol. */ + unformat_function_t * unformat_pg_edit; +} tcp_udp_port_info_t; + +typedef struct { + /* Per IP protocol info. */ + ip_protocol_info_t * protocol_infos; + + /* Protocol info index hashed by 8 bit IP protocol. */ + uword * protocol_info_by_protocol; + + /* Hash table mapping IP protocol name (see protocols.def) + to protocol number. 
*/ + uword * protocol_info_by_name; + + /* Per TCP/UDP port info. */ + tcp_udp_port_info_t * port_infos; + + /* Hash table from network-byte-order port to port info index. */ + uword * port_info_by_port; + + /* Hash table mapping TCP/UDP name to port info index. */ + uword * port_info_by_name; +} ip_main_t; + +extern ip_main_t ip_main; + +clib_error_t * +ip_main_init (vlib_main_t * vm); +/* Returns the protocol_infos element whose index is stored in protocol_info_by_protocol for this protocol, or 0 when hash_get finds no entry. */ +static inline ip_protocol_info_t * +ip_get_protocol_info (ip_main_t * im, u32 protocol) +{ + uword * p; + + p = hash_get (im->protocol_info_by_protocol, protocol); + return p ? vec_elt_at_index (im->protocol_infos, p[0]) : 0; +} +/* Returns the port_infos element whose index is stored in port_info_by_port for this port, or 0 when hash_get finds no entry. */ +static inline tcp_udp_port_info_t * +ip_get_tcp_udp_port_info (ip_main_t * im, u32 port) +{ + uword * p; + + p = hash_get (im->port_info_by_port, port); + return p ? vec_elt_at_index (im->port_infos, p[0]) : 0; +} +/* Folds n_bytes_to_checksum bytes into sum, starting first_buffer_offset bytes past the current data of first_buffer and continuing through chained segments. NOTE(review): in both variants the first chunk is clamped to the full segment length (mb->data_len / b->current_length) although reading starts first_buffer_offset bytes in -- confirm callers never ask for more than (segment length - offset) bytes from the first segment. */ +always_inline ip_csum_t +ip_incremental_checksum_buffer (vlib_main_t * vm, vlib_buffer_t * first_buffer, + u32 first_buffer_offset, + u32 n_bytes_to_checksum, + ip_csum_t sum) +#if DPDK > 0 +{ + u32 n_bytes_left = n_bytes_to_checksum; + struct rte_mbuf * mb = ((struct rte_mbuf *)first_buffer)-1; + u8 nb_segs = mb->nb_segs; + ASSERT(mb->data_len >= first_buffer_offset); + void * h; + u32 n; + + n = clib_min (n_bytes_left, mb->data_len); + h = vlib_buffer_get_current (first_buffer) + first_buffer_offset; + while (n_bytes_left) + { + sum = ip_incremental_checksum (sum, h, n); + n_bytes_left -= n; + nb_segs--; + mb = mb->next; + if ((nb_segs == 0) || (mb == 0)) + break; + + n = clib_min (n_bytes_left, mb->data_len); + h = rte_ctrlmbuf_data(mb); + } + + ASSERT(n_bytes_left == 0); + ASSERT(nb_segs == 0); + return sum; +} +#else +{ + vlib_buffer_t * b = first_buffer; + u32 n_bytes_left = n_bytes_to_checksum; + ASSERT (b->current_length >= first_buffer_offset); + void * h; + u32 n; + + n = clib_min (n_bytes_left, b->current_length); + h = vlib_buffer_get_current (b) + first_buffer_offset; + sum = ip_incremental_checksum (sum, h, n); + if 
(PREDICT_FALSE (b->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + while (1) + { + n_bytes_left -= n; + if (n_bytes_left == 0) + break; + b = vlib_get_buffer (vm, b->next_buffer); + n = clib_min (n_bytes_left, b->current_length); + h = vlib_buffer_get_current (b); + sum = ip_incremental_checksum (sum, h, n); + } + } + + return sum; +} +#endif /* DPDK */ + +void ip_del_all_interface_addresses (vlib_main_t *vm, u32 sw_if_index); + +#endif /* included_ip_main_h */ diff --git a/vnet/vnet/ip/ip4.h b/vnet/vnet/ip/ip4.h new file mode 100644 index 00000000000..6b8fd59a022 --- /dev/null +++ b/vnet/vnet/ip/ip4.h @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4.h: ip4 main include file + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_ip_ip4_h +#define included_ip_ip4_h + +#include <vnet/ip/ip4_mtrie.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/ip/lookup.h> + +typedef struct ip4_fib_t { + /* Hash table for each prefix length mapping. */ + uword * adj_index_by_dst_address[33]; + + /* Temporary vectors for holding new/old values for hash_set. */ + uword * new_hash_values, * old_hash_values; + + /* Mtrie for fast lookups. Hash is used to maintain overlapping prefixes. */ + ip4_fib_mtrie_t mtrie; + + /* Table ID (hash key) for this FIB. */ + u32 table_id; + + /* Index into FIB vector. 
*/ + u32 index; + + /* flow hash configuration */ + u32 flow_hash_config; + + /* N-tuple classifier indices */ + u32 fwd_classify_table_index; + u32 rev_classify_table_index; + +} ip4_fib_t; + +struct ip4_main_t; + +typedef void (ip4_add_del_route_function_t) + (struct ip4_main_t * im, + uword opaque, + ip4_fib_t * fib, + u32 flags, + ip4_address_t * address, + u32 address_length, + void * old_result, + void * new_result); + +typedef struct { + ip4_add_del_route_function_t * function; + uword required_flags; + uword function_opaque; +} ip4_add_del_route_callback_t; + +typedef void (ip4_add_del_interface_address_function_t) + (struct ip4_main_t * im, + uword opaque, + u32 sw_if_index, + ip4_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_del); + +typedef struct { + ip4_add_del_interface_address_function_t * function; + uword function_opaque; +} ip4_add_del_interface_address_callback_t; + +typedef enum { + /* First check access list to either permit or deny this + packet based on classification. */ + IP4_RX_FEATURE_CHECK_ACCESS, + + /* RPF check: verify that source address is reachable via + RX interface or via any interface. */ + IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_RX, + IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_ANY, + + /* IPSec */ + IP4_RX_FEATURE_IPSEC, + + /* vPath forwarding: won't return to call next feature + so any feature needed before vPath forwarding must be prior + to this entry */ + IP4_RX_FEATURE_VPATH, + + /* Must be last: perform forwarding lookup. */ + IP4_RX_FEATURE_LOOKUP, + + IP4_N_RX_FEATURE, +} ip4_rx_feature_type_t; + +typedef struct ip4_main_t { + ip_lookup_main_t lookup_main; + + /* Vector of FIBs. */ + ip4_fib_t * fibs; + + u32 fib_masks[33]; + + /* Table index indexed by software interface. */ + u32 * fib_index_by_sw_if_index; + + /* Hash table mapping table id to fib index. + ID space is not necessarily dense; index space is dense. 
*/ + uword * fib_index_by_table_id; + + /* Vector of functions to call when routes are added/deleted. */ + ip4_add_del_route_callback_t * add_del_route_callbacks; + + /* Hash table mapping interface route rewrite adjacency index by sw if index. */ + uword * interface_route_adj_index_by_sw_if_index; + + /* Functions to call when interface address changes. */ + ip4_add_del_interface_address_callback_t * add_del_interface_address_callbacks; + + /* Template used to generate IP4 ARP packets. */ + vlib_packet_template_t ip4_arp_request_packet_template; + + /* Seed for Jenkins hash used to compute ip4 flow hash. */ + u32 flow_hash_seed; + + struct { + /* TTL to use for host generated packets. */ + u8 ttl; + + /* TOS byte to use for host generated packets. */ + u8 tos; + + u8 pad[2]; + } host_config; +} ip4_main_t; + +/* Global ip4 main structure. */ +extern ip4_main_t ip4_main; + +/* Global ip4 input node. Errors get attached to ip4 input node. */ +extern vlib_node_registration_t ip4_input_node; +extern vlib_node_registration_t ip4_lookup_node; +extern vlib_node_registration_t ip4_rewrite_node; +extern vlib_node_registration_t ip4_arp_node; + +u32 ip4_fib_lookup_with_table (ip4_main_t * im, u32 fib_index, ip4_address_t * dst, + u32 disable_default_route); + +always_inline u32 +ip4_fib_lookup_buffer (ip4_main_t * im, u32 fib_index, ip4_address_t * dst, + vlib_buffer_t * b) +{ + return ip4_fib_lookup_with_table (im, fib_index, dst, + /* disable_default_route */ 0); +} + +always_inline u32 +ip4_fib_lookup (ip4_main_t * im, u32 sw_if_index, ip4_address_t * dst) +{ + u32 fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index); + return ip4_fib_lookup_with_table (im, fib_index, dst, + /* disable_default_route */ 0); +} + +always_inline uword +ip4_destination_matches_route (ip4_main_t * im, + ip4_address_t * key, + ip4_address_t * dest, + uword dest_length) +{ return 0 == ((key->data_u32 ^ dest->data_u32) & im->fib_masks[dest_length]); } + +always_inline uword 
+ip4_destination_matches_interface (ip4_main_t * im, + ip4_address_t * key, + ip_interface_address_t * ia) +{ + ip4_address_t * a = ip_interface_address_get_address (&im->lookup_main, ia); + return ip4_destination_matches_route (im, key, a, ia->address_length); +} + +/* As above but allows for unaligned destinations (e.g. works right from IP header of packet). */ +always_inline uword +ip4_unaligned_destination_matches_route (ip4_main_t * im, + ip4_address_t * key, + ip4_address_t * dest, + uword dest_length) +{ return 0 == ((clib_mem_unaligned (&key->data_u32, u32) ^ dest->data_u32) & im->fib_masks[dest_length]); } + +always_inline void +ip4_src_address_for_packet (ip4_main_t * im, vlib_buffer_t * p, ip4_address_t * src, u32 sw_if_index) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip_interface_address_t * ia = ip_interface_address_for_packet (lm, p, sw_if_index); + ip4_address_t * a = ip_interface_address_get_address (lm, ia); + *src = a[0]; +} + +/* Find interface address which matches destination. */ +always_inline ip4_address_t * +ip4_interface_address_matching_destination (ip4_main_t * im, ip4_address_t * dst, u32 sw_if_index, + ip_interface_address_t ** result_ia) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip_interface_address_t * ia; + ip4_address_t * result = 0; + + foreach_ip_interface_address (lm, ia, sw_if_index, + 1 /* honor unnumbered */, + ({ + ip4_address_t * a = ip_interface_address_get_address (lm, ia); + if (ip4_destination_matches_route (im, dst, a, ia->address_length)) + { + result = a; + break; + } + })); + if (result_ia) + *result_ia = result ? ia : 0; + return result; +} + +clib_error_t * +ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index, + ip4_address_t * address, u32 address_length, + u32 is_del); + +int ip4_address_compare (ip4_address_t * a1, ip4_address_t * a2); + +/* Add/del a route to the FIB. 
*/ + +#define IP4_ROUTE_FLAG_ADD (0 << 0) /* value 0: "add" is the DEL bit being clear, so it cannot be tested with & */ +#define IP4_ROUTE_FLAG_DEL (1 << 0) +#define IP4_ROUTE_FLAG_TABLE_ID (0 << 1) /* value 0: "table id" is the FIB_INDEX bit being clear */ +#define IP4_ROUTE_FLAG_FIB_INDEX (1 << 1) +#define IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY (1 << 2) +#define IP4_ROUTE_FLAG_NO_REDISTRIBUTE (1 << 3) +/* Not last add/del in group. Facilitates batching requests into packets. */ +#define IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP (1 << 4) +/* Dynamic route created via ARP reply. */ +#define IP4_ROUTE_FLAG_NEIGHBOR (1 << 5) + +typedef struct { + /* IP4_ROUTE_FLAG_* */ + u32 flags; + + /* Either index of fib or table_id to hash and get fib. + IP4_ROUTE_FLAG_FIB_INDEX specifies index; otherwise table_id is assumed. */ + u32 table_index_or_table_id; + + /* Destination address (prefix) and length. */ + ip4_address_t dst_address; + u32 dst_address_length; + + /* Adjacency to use for this destination. */ + u32 adj_index; + + /* If specified, adjacencies to add and then + use for this destination. add_adj/n_add_adj + override adj_index if specified. 
*/ + ip_adjacency_t * add_adj; + u32 n_add_adj; +} ip4_add_del_route_args_t; + +ip4_fib_t * +find_ip4_fib_by_table_index_or_id (ip4_main_t * im, + u32 table_index_or_id, u32 flags); + +void ip4_add_del_route (ip4_main_t * im, ip4_add_del_route_args_t * args); + +void ip4_add_del_route_next_hop (ip4_main_t * im, + u32 flags, + ip4_address_t * dst_address, + u32 dst_address_length, + ip4_address_t * next_hop, + u32 next_hop_sw_if_index, + u32 next_hop_weight, u32 adj_index, + u32 explicit_fib_index); + +void * +ip4_get_route (ip4_main_t * im, + u32 fib_index_or_table_id, + u32 flags, + u8 * address, + u32 address_length); + +void +ip4_foreach_matching_route (ip4_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip4_address_t * address, + u32 address_length, + ip4_address_t ** results, + u8 ** result_lengths); + +void ip4_delete_matching_routes (ip4_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip4_address_t * address, + u32 address_length); + +void ip4_maybe_remap_adjacencies (ip4_main_t * im, + u32 table_index_or_table_id, + u32 flags); + +void ip4_adjacency_set_interface_route (vnet_main_t * vnm, + ip_adjacency_t * adj, + u32 sw_if_index, + u32 if_address_index); + +/* Send an ARP request to see if given destination is reachable on given interface. 
*/ +clib_error_t * +ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index); + +clib_error_t * +ip4_set_arp_limit (u32 arp_limit); + +uword +ip4_tcp_register_listener (vlib_main_t * vm, + u16 dst_port, + u32 next_node_index); +uword +ip4_udp_register_listener (vlib_main_t * vm, + u16 dst_port, + u32 next_node_index); + +void +ip4_icmp_register_type (vlib_main_t * vm, icmp4_type_t type, + u32 node_index); + +u16 ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, ip4_header_t * ip0); + +void ip4_register_protocol (u32 protocol, u32 node_index); + +serialize_function_t serialize_vnet_ip4_main, unserialize_vnet_ip4_main; + +int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config); + +void ip4_mtrie_init (ip4_fib_mtrie_t * m); + +int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index, + u32 table_index); + +/* Compute flow hash. We'll use it to select which adjacency to use for this + flow. And other things. */ +always_inline u32 +ip4_compute_flow_hash (ip4_header_t * ip, u32 flow_hash_config) +{ + tcp_header_t * tcp = (void *) (ip + 1); + u32 a, b, c, t1, t2; + uword is_tcp_udp = (ip->protocol == IP_PROTOCOL_TCP + || ip->protocol == IP_PROTOCOL_UDP); + + t1 = (flow_hash_config & IP_FLOW_HASH_SRC_ADDR) + ? ip->src_address.data_u32 : 0; + t2 = (flow_hash_config & IP_FLOW_HASH_DST_ADDR) + ? ip->dst_address.data_u32 : 0; + + a = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t2 : t1; + b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2; + b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0; + + t1 = is_tcp_udp ? tcp->ports.src : 0; + t2 = is_tcp_udp ? tcp->ports.dst : 0; + + t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0; + t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0; + + c = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? 
+ (t1<<16) | t2 : (t2<<16) | t1; + + hash_v3_mix32 (a, b, c); + hash_v3_finalize32 (a, b, c); + + return c; +} + +#endif /* included_ip_ip4_h */ diff --git a/vnet/vnet/ip/ip46_cli.c b/vnet/vnet/ip/ip46_cli.c new file mode 100644 index 00000000000..44dde9bf3e7 --- /dev/null +++ b/vnet/vnet/ip/ip46_cli.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4_cli.c: ip4 commands + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> +/* Three-way compare of two IP4 addresses in host byte order: returns <0, 0 or >0. Uses explicit comparisons because a u32 difference narrowed to int has the wrong sign once the addresses differ by 2^31 or more. */ +int ip4_address_compare (ip4_address_t * a1, ip4_address_t * a2) +{ u32 h1 = clib_net_to_host_u32 (a1->data_u32), h2 = clib_net_to_host_u32 (a2->data_u32); return (h1 > h2) - (h1 < h2); } +/* Three-way compare of two IP6 addresses, 16 bits at a time in host byte order; the first differing group decides. The u16 operands promote to int, so the subtraction cannot wrap. */ +int ip6_address_compare (ip6_address_t * a1, ip6_address_t * a2) +{ + int i; + for (i = 0; i < ARRAY_LEN (a1->as_u16); i++) + { + int cmp = clib_net_to_host_u16 (a1->as_u16[i]) - clib_net_to_host_u16 (a2->as_u16[i]); + if (cmp != 0) + return cmp; + } + return 0; +} + +VLIB_CLI_COMMAND (set_interface_ip_command, static) = { + .path = "set interface ip", + .short_help = "IP4/IP6 commands", +}; +/* Deletes every IP4 and IP6 address on sw_if_index. Addresses and prefix lengths are first copied into temporary vectors and only then deleted, so the deletions do not run inside the foreach_ip_interface_address walks. */ +void ip_del_all_interface_addresses (vlib_main_t *vm, u32 sw_if_index) +{ + ip4_main_t * im4 = &ip4_main; + ip4_address_t * ip4_addrs = 0; + u32 *ip4_masks = 0; + ip6_main_t * im6 = &ip6_main; + ip6_address_t * ip6_addrs = 0; + u32 *ip6_masks = 0; + ip_interface_address_t * ia; + int i; + + foreach_ip_interface_address (&im4->lookup_main, ia, sw_if_index, + 0 /* honor unnumbered */, + ({ + ip4_address_t * x = (ip4_address_t *) + ip_interface_address_get_address (&im4->lookup_main, ia); + vec_add1 (ip4_addrs, x[0]); + vec_add1 (ip4_masks, ia->address_length); + })); + + foreach_ip_interface_address (&im6->lookup_main, ia, sw_if_index, + 0 /* honor unnumbered */, + ({ + ip6_address_t * x = (ip6_address_t *) + ip_interface_address_get_address (&im6->lookup_main, ia); + vec_add1 (ip6_addrs, x[0]); + vec_add1 (ip6_masks, ia->address_length); + })); + + for (i = 0; i < vec_len (ip4_addrs); i++) + ip4_add_del_interface_address (vm, sw_if_index, &ip4_addrs[i], + ip4_masks[i], 1 /* is_del */); + for (i = 0; i < vec_len (ip6_addrs); i++) + ip6_add_del_interface_address (vm, sw_if_index, &ip6_addrs[i], + ip6_masks[i], 1 /* is_del 
*/); + + vec_free (ip4_addrs); + vec_free (ip4_masks); + vec_free (ip6_addrs); + vec_free (ip6_masks); +} + +static clib_error_t * +add_del_ip_address (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + ip4_address_t a4; + ip6_address_t a6; + clib_error_t * error = 0; + u32 sw_if_index, length, is_del; + + sw_if_index = ~0; + is_del = 0; + + if (unformat (input, "del")) + is_del = 1; + + if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + error = clib_error_return (0, "unknown interface `%U'", + format_unformat_error, input); + goto done; + } + + if (is_del && unformat (input, "all")) + ip_del_all_interface_addresses (vm, sw_if_index); + else if (unformat (input, "%U/%d", unformat_ip4_address, &a4, &length)) + error = ip4_add_del_interface_address (vm, sw_if_index, &a4, length, + is_del); + else if (unformat (input, "%U/%d", unformat_ip6_address, &a6, &length)) + error = ip6_add_del_interface_address (vm, sw_if_index, &a6, length, + is_del); + else + { + error = clib_error_return (0, "expected IP4/IP6 address/length `%U'", + format_unformat_error, input); + goto done; + } + + + done: + return error; +} + +VLIB_CLI_COMMAND (set_interface_ip_address_command, static) = { + .path = "set interface ip address", + .function = add_del_ip_address, + .short_help = "Add/delete IP4/IP6 address for interface", +}; + +/* Dummy init function to get us linked in. */ +static clib_error_t * ip4_cli_init (vlib_main_t * vm) +{ return 0; } + +VLIB_INIT_FUNCTION (ip4_cli_init); diff --git a/vnet/vnet/ip/ip4_error.h b/vnet/vnet/ip/ip4_error.h new file mode 100644 index 00000000000..b84b082b993 --- /dev/null +++ b/vnet/vnet/ip/ip4_error.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4_error.h: ip4 fast path errors + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_ip_ip4_error_h +#define included_ip_ip4_error_h + +#define foreach_ip4_error \ + /* Must be first. 
*/ \ + _ (NONE, "valid ip4 packets") \ + \ + /* Errors signalled by ip4-input */ \ + _ (TOO_SHORT, "ip4 length < 20 bytes") \ + _ (BAD_LENGTH, "ip4 length > l2 length") \ + _ (BAD_CHECKSUM, "bad ip4 checksum") \ + _ (VERSION, "ip4 version != 4") \ + _ (OPTIONS, "ip4 options present") \ + _ (FRAGMENT_OFFSET_ONE, "ip4 fragment offset == 1") \ + _ (TIME_EXPIRED, "ip4 ttl <= 1") \ + \ + /* Errors signalled by ip4-rewrite. */ \ + _ (MTU_EXCEEDED, "ip4 MTU exceeded and DF set") \ + _ (DST_LOOKUP_MISS, "ip4 destination lookup miss") \ + _ (SRC_LOOKUP_MISS, "ip4 source lookup miss") \ + _ (ADJACENCY_DROP, "ip4 adjacency drop") \ + _ (ADJACENCY_PUNT, "ip4 adjacency punt") \ + \ + /* Errors signalled by ip4-local. */ \ + _ (UNKNOWN_PROTOCOL, "unknown ip protocol") \ + _ (TCP_CHECKSUM, "bad tcp checksum") \ + _ (UDP_CHECKSUM, "bad udp checksum") \ + _ (UDP_LENGTH, "inconsistent udp/ip lengths") \ + \ + /* Errors signalled by ip4-source-check. */ \ + _ (UNICAST_SOURCE_CHECK_FAILS, "ip4 unicast source check fails") \ + \ + /* Spoofed packets in ip4-rewrite-local */ \ + _(SPOOFED_LOCAL_PACKETS, "ip4 spoofed local-address packet drops") \ + \ + /* Errors signalled by ip4-inacl */ \ + _ (INACL_TABLE_MISS, "input ACL table-miss drops") \ + _ (INACL_SESSION_DENY, "input ACL session deny drops") +/* One IP4_ERROR_<sym> enumerator per foreach_ip4_error entry, in list order; IP4_N_ERROR follows the last entry and so equals the number of entries. */ +typedef enum { +#define _(sym,str) IP4_ERROR_##sym, + foreach_ip4_error +#undef _ + IP4_N_ERROR, +} ip4_error_t; + +#endif /* included_ip_ip4_error_h */ diff --git a/vnet/vnet/ip/ip4_format.c b/vnet/vnet/ip/ip4_format.c new file mode 100644 index 00000000000..5f4f8e3667d --- /dev/null +++ b/vnet/vnet/ip/ip4_format.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4_format.c: ip4 formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> + +/* Format an IP4 address. */ +u8 * format_ip4_address (u8 * s, va_list * args) +{ + u8 * a = va_arg (*args, u8 *); + return format (s, "%d.%d.%d.%d", a[0], a[1], a[2], a[3]); +} + +/* Format an IP4 route destination and length. 
*/ +u8 * format_ip4_address_and_length (u8 * s, va_list * args) +{ + u8 * a = va_arg (*args, u8 *); + u8 l = va_arg (*args, u32); + return format (s, "%U/%d", format_ip4_address, a, l); +} + +/* Parse an IP4 address %d.%d.%d.%d. */ +uword unformat_ip4_address (unformat_input_t * input, va_list * args) +{ + u8 * result = va_arg (*args, u8 *); + unsigned a[4]; + + if (! unformat (input, "%d.%d.%d.%d", &a[0], &a[1], &a[2], &a[3])) + return 0; + + if (a[0] >= 256 || a[1] >= 256 || a[2] >= 256 || a[3] >= 256) + return 0; + + result[0] = a[0]; + result[1] = a[1]; + result[2] = a[2]; + result[3] = a[3]; + + return 1; +} + +/* Format an IP4 header. */ +u8 * format_ip4_header (u8 * s, va_list * args) +{ + ip4_header_t * ip = va_arg (*args, ip4_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + u32 ip_version, header_bytes; + uword indent; + + /* Nothing to do. */ + if (max_header_bytes < sizeof (ip[0])) + return format (s, "IP header truncated"); + + indent = format_get_indent (s); + indent += 2; + + ip_version = (ip->ip_version_and_header_length >> 4); + header_bytes = (ip->ip_version_and_header_length & 0xf) * sizeof (u32); + + s = format (s, "%U: %U -> %U", + format_ip_protocol, ip->protocol, + format_ip4_address, ip->src_address.data, + format_ip4_address, ip->dst_address.data); + + /* Show IP version and header length only with unexpected values. */ + if (ip_version != 4 || header_bytes != sizeof (ip4_header_t)) + s = format (s, "\n%Uversion %d, header length %d", + format_white_space, indent, + ip_version, header_bytes); + + s = format (s, "\n%Utos 0x%02x, ttl %d, length %d, checksum 0x%04x", + format_white_space, indent, + ip->tos, ip->ttl, + clib_net_to_host_u16 (ip->length), + clib_net_to_host_u16 (ip->checksum)); + + /* Check and report invalid checksums. 
*/ + { + u16 c = ip4_header_checksum (ip); + if (c != ip->checksum) + s = format (s, " (should be 0x%04x)", clib_net_to_host_u16 (c)); + } + + { + /* f starts as the host-order flags_and_fragment_offset word. */ + u32 f = clib_net_to_host_u16 (ip->flags_and_fragment_offset); + u32 o; + + s = format (s, "\n%Ufragment id 0x%04x", + format_white_space, indent, + clib_net_to_host_u16 (ip->fragment_id)); + + /* Fragment offset: low 13 bits of the word, scaled by 8 to get bytes. */ + o = 8 * (f & 0x1fff); + /* Clear the 13-bit offset field itself so that only flag bits remain in f. + XOR-ing with the scaled byte offset o would leave offset bits set and + could flip genuine flag bits. */ + f &= ~0x1fff; + if (o != 0) + s = format (s, " offset %d", o); + + /* Anything still set in f is a flag bit: print the name of each known one. */ + if (f != 0) + { + s = format (s, ", flags "); +#define _(l) if (f & IP4_HEADER_FLAG_##l) s = format (s, #l); + _ (MORE_FRAGMENTS); + _ (DONT_FRAGMENT); + _ (CONGESTION); +#undef _ + } + } + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && header_bytes < max_header_bytes) + { + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", + format_white_space, indent - 2, + pi->format_header, + /* next protocol header */ (void*) ip + header_bytes, + max_header_bytes - header_bytes); + } + + return s; +} + +/* Parse an IP4 header. */ +uword unformat_ip4_header (unformat_input_t * input, va_list * args) +{ + u8 ** result = va_arg (*args, u8 **); + ip4_header_t * ip; + int old_length; + + /* Allocate space for IP header. */ + { + void * p; + + old_length = vec_len (*result); + vec_add2 (*result, p, sizeof (ip4_header_t)); + ip = p; + } + + memset (ip, 0, sizeof (ip[0])); + ip->ip_version_and_header_length = IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS; + + if (! unformat (input, "%U: %U -> %U", + unformat_ip_protocol, &ip->protocol, + unformat_ip4_address, &ip->src_address, + unformat_ip4_address, &ip->dst_address)) + return 0; + + /* Parse options. 
*/ + while (1) + { + /* i receives single-number options and the fragment id; j receives the + fragment byte offset. */ + int i, j; + + if (unformat (input, "tos %U", unformat_vlib_number, &i)) + ip->tos = i; + + else if (unformat (input, "ttl %U", unformat_vlib_number, &i)) + ip->ttl = i; + + else if (unformat (input, "fragment id %U offset %U", + unformat_vlib_number, &i, + unformat_vlib_number, &j)) + { + ip->fragment_id = clib_host_to_net_u16 (i); + /* The offset field is encoded from the parsed offset j (bytes / 8), + not from the fragment id i. */ + ip->flags_and_fragment_offset |= + clib_host_to_net_u16 ((j / 8) & 0x1fff); + } + + /* Flags. */ + else if (unformat (input, "mf") || unformat (input, "MF")) + ip->flags_and_fragment_offset |= clib_host_to_net_u16 (IP4_HEADER_FLAG_MORE_FRAGMENTS); + + else if (unformat (input, "df") || unformat (input, "DF")) + ip->flags_and_fragment_offset |= clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT); + + else if (unformat (input, "ce") || unformat (input, "CE")) + ip->flags_and_fragment_offset |= clib_host_to_net_u16 (IP4_HEADER_FLAG_CONGESTION); + + /* Can't parse input: try next protocol level. */ + else + break; + } + + /* Recurse into next protocol layer. */ + { + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol); + + if (pi && pi->unformat_header) + { + if (! unformat_user (input, pi->unformat_header, result)) + return 0; + + /* Result may have moved. */ + ip = (void *) *result + old_length; + } + } + + /* Fill in IP length. */ + ip->length = clib_host_to_net_u16 (vec_len (*result) - old_length); + + /* Fill in checksum last: the IPv4 header checksum is computed over the + whole header, so it must see the length stored just above. */ + ip->checksum = ip4_header_checksum (ip); + + return 1; +} diff --git a/vnet/vnet/ip/ip4_forward.c b/vnet/vnet/ip/ip4_forward.c new file mode 100644 index 00000000000..fd304163a6b --- /dev/null +++ b/vnet/vnet/ip/ip4_forward.c @@ -0,0 +1,3564 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4_forward.c: IP v4 forwarding + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> /* for ethernet_header_t */ +#include <vnet/ethernet/arp_packet.h> /* for ethernet_arp_header_t */ +#include <vnet/ppp/ppp.h> +#include <vnet/srp/srp.h> /* for srp_hw_interface_class */ +#include <vnet/api_errno.h> /* for API error numbers */ + +/* This is really, really simple but stupid fib. */ +u32 +ip4_fib_lookup_with_table (ip4_main_t * im, u32 fib_index, + ip4_address_t * dst, + u32 disable_default_route) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip4_fib_t * fib = vec_elt_at_index (im->fibs, fib_index); + uword * p, * hash, key; + i32 i, i_min, dst_address, ai; + + i_min = disable_default_route ? 1 : 0; + dst_address = clib_mem_unaligned (&dst->data_u32, u32); + for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= i_min; i--) + { + hash = fib->adj_index_by_dst_address[i]; + if (! hash) + continue; + + key = dst_address & im->fib_masks[i]; + if ((p = hash_get (hash, key)) != 0) + { + ai = p[0]; + goto done; + } + } + + /* Nothing matches in table. */ + ai = lm->miss_adj_index; + + done: + return ai; +} + +static ip4_fib_t * +create_fib_with_table_id (ip4_main_t * im, u32 table_id) +{ + ip4_fib_t * fib; + hash_set (im->fib_index_by_table_id, table_id, vec_len (im->fibs)); + vec_add2 (im->fibs, fib, 1); + fib->table_id = table_id; + fib->index = fib - im->fibs; + fib->flow_hash_config = IP_FLOW_HASH_DEFAULT; + fib->fwd_classify_table_index = ~0; + fib->rev_classify_table_index = ~0; + ip4_mtrie_init (&fib->mtrie); + return fib; +} + +ip4_fib_t * +find_ip4_fib_by_table_index_or_id (ip4_main_t * im, + u32 table_index_or_id, u32 flags) +{ + uword * p, fib_index; + + fib_index = table_index_or_id; + if (! (flags & IP4_ROUTE_FLAG_FIB_INDEX)) + { + p = hash_get (im->fib_index_by_table_id, table_index_or_id); + if (! 
p) + return create_fib_with_table_id (im, table_index_or_id); + fib_index = p[0]; + } + return vec_elt_at_index (im->fibs, fib_index); +} + +static void +ip4_fib_init_adj_index_by_dst_address (ip_lookup_main_t * lm, + ip4_fib_t * fib, + u32 address_length) +{ + hash_t * h; + uword max_index; + + ASSERT (lm->fib_result_n_bytes >= sizeof (uword)); + lm->fib_result_n_words = round_pow2 (lm->fib_result_n_bytes, sizeof (uword)) / sizeof (uword); + + fib->adj_index_by_dst_address[address_length] = + hash_create (32 /* elts */, lm->fib_result_n_words * sizeof (uword)); + + hash_set_flags (fib->adj_index_by_dst_address[address_length], + HASH_FLAG_NO_AUTO_SHRINK); + + h = hash_header (fib->adj_index_by_dst_address[address_length]); + max_index = (hash_value_bytes (h) / sizeof (fib->new_hash_values[0])) - 1; + + /* Initialize new/old hash value vectors. */ + vec_validate_init_empty (fib->new_hash_values, max_index, ~0); + vec_validate_init_empty (fib->old_hash_values, max_index, ~0); +} + +static void serialize_ip4_address (serialize_main_t * m, va_list * va) +{ + ip4_address_t * a = va_arg (*va, ip4_address_t *); + u8 * p = serialize_get (m, sizeof (a->as_u8)); + memcpy (p, a->as_u8, sizeof (a->as_u8)); +} + +static void unserialize_ip4_address (serialize_main_t * m, va_list * va) +{ + ip4_address_t * a = va_arg (*va, ip4_address_t *); + u8 * p = unserialize_get (m, sizeof (a->as_u8)); + memcpy (a->as_u8, p, sizeof (a->as_u8)); +} + +static void serialize_ip4_address_and_length (serialize_main_t * m, va_list * va) +{ + ip4_address_t * a = va_arg (*va, ip4_address_t *); + u32 l = va_arg (*va, u32); + u32 n_bytes = (l / 8) + ((l % 8) != 0); + u8 * p = serialize_get (m, 1 + n_bytes); + ASSERT (l <= 32); + p[0] = l; + memcpy (p + 1, a->as_u8, n_bytes); +} + +static void unserialize_ip4_address_and_length (serialize_main_t * m, va_list * va) +{ + ip4_address_t * a = va_arg (*va, ip4_address_t *); + u32 * al = va_arg (*va, u32 *); + u8 * p = unserialize_get (m, 1); + u32 l, 
n_bytes; + + /* First byte on the wire is the prefix length; it is also returned via al. */ + al[0] = l = p[0]; + ASSERT (l <= 32); + n_bytes = (l / 8) + ((l % 8) != 0); + + /* Only the leading n_bytes of the address are transmitted. Zero the whole + address first so the bytes that are not copied below are well defined + instead of whatever the caller's storage happened to contain. */ + memset (a->as_u8, 0, sizeof (a->as_u8)); + + if (n_bytes) + { + p = unserialize_get (m, n_bytes); + memcpy (a->as_u8, p, n_bytes); + } +} + +static void serialize_ip4_add_del_route_msg (serialize_main_t * m, va_list * va) +{ + ip4_add_del_route_args_t * a = va_arg (*va, ip4_add_del_route_args_t *); + + serialize_likely_small_unsigned_integer (m, a->table_index_or_table_id); + serialize_likely_small_unsigned_integer (m, a->flags); + serialize (m, serialize_ip4_address_and_length, &a->dst_address, a->dst_address_length); + serialize_likely_small_unsigned_integer (m, a->adj_index); + serialize_likely_small_unsigned_integer (m, a->n_add_adj); + if (a->n_add_adj > 0) + serialize (m, serialize_vec_ip_adjacency, a->add_adj, a->n_add_adj); +} + +/* Serialized adjacencies for arp/rewrite do not send graph next_index + since graph hookup is not guaranteed to be the same for both sides + of serialize/unserialize. */ +static void +unserialize_fixup_ip4_rewrite_adjacencies (vlib_main_t * vm, + ip_adjacency_t * adj, + u32 n_adj) +{ + vnet_main_t * vnm = vnet_get_main(); + u32 i, ni, sw_if_index, is_arp; + vnet_hw_interface_t * hw; + + for (i = 0; i < n_adj; i++) + { + switch (adj[i].lookup_next_index) + { + case IP_LOOKUP_NEXT_REWRITE: + case IP_LOOKUP_NEXT_ARP: + is_arp = adj[i].lookup_next_index == IP_LOOKUP_NEXT_ARP; + sw_if_index = adj[i].rewrite_header.sw_if_index; + hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + ni = is_arp ? 
ip4_arp_node.index : ip4_rewrite_node.index; + adj[i].rewrite_header.node_index = ni; + adj[i].rewrite_header.next_index = vlib_node_add_next (vm, ni, hw->output_node_index); + if (is_arp) + vnet_rewrite_for_sw_interface + (vnm, + VNET_L3_PACKET_TYPE_ARP, + sw_if_index, + ni, + VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST, + &adj[i].rewrite_header, + sizeof (adj->rewrite_data)); + break; + + default: + break; + } + } +} + +static void unserialize_ip4_add_del_route_msg (serialize_main_t * m, va_list * va) +{ + ip4_main_t * i4m = &ip4_main; + ip4_add_del_route_args_t a; + + a.table_index_or_table_id = unserialize_likely_small_unsigned_integer (m); + a.flags = unserialize_likely_small_unsigned_integer (m); + unserialize (m, unserialize_ip4_address_and_length, &a.dst_address, &a.dst_address_length); + a.adj_index = unserialize_likely_small_unsigned_integer (m); + a.n_add_adj = unserialize_likely_small_unsigned_integer (m); + a.add_adj = 0; + if (a.n_add_adj > 0) + { + vec_resize (a.add_adj, a.n_add_adj); + unserialize (m, unserialize_vec_ip_adjacency, a.add_adj, a.n_add_adj); + unserialize_fixup_ip4_rewrite_adjacencies (vlib_get_main(), + a.add_adj, a.n_add_adj); + } + + /* Prevent re-re-distribution. 
*/ + a.flags |= IP4_ROUTE_FLAG_NO_REDISTRIBUTE; + + ip4_add_del_route (i4m, &a); + + vec_free (a.add_adj); +} + +MC_SERIALIZE_MSG (ip4_add_del_route_msg, static) = { + .name = "vnet_ip4_add_del_route", + .serialize = serialize_ip4_add_del_route_msg, + .unserialize = unserialize_ip4_add_del_route_msg, +}; + +static void +ip4_fib_set_adj_index (ip4_main_t * im, + ip4_fib_t * fib, + u32 flags, + u32 dst_address_u32, + u32 dst_address_length, + u32 adj_index) +{ + ip_lookup_main_t * lm = &im->lookup_main; + uword * hash; + + if (vec_bytes(fib->old_hash_values)) + memset (fib->old_hash_values, ~0, vec_bytes (fib->old_hash_values)); + if (vec_bytes(fib->new_hash_values)) + memset (fib->new_hash_values, ~0, vec_bytes (fib->new_hash_values)); + fib->new_hash_values[0] = adj_index; + + /* Make sure adj index is valid. */ + if (CLIB_DEBUG > 0) + (void) ip_get_adjacency (lm, adj_index); + + hash = fib->adj_index_by_dst_address[dst_address_length]; + + hash = _hash_set3 (hash, dst_address_u32, + fib->new_hash_values, + fib->old_hash_values); + + fib->adj_index_by_dst_address[dst_address_length] = hash; + + if (vec_len (im->add_del_route_callbacks) > 0) + { + ip4_add_del_route_callback_t * cb; + ip4_address_t d; + uword * p; + + d.data_u32 = dst_address_u32; + vec_foreach (cb, im->add_del_route_callbacks) + if ((flags & cb->required_flags) == cb->required_flags) + cb->function (im, cb->function_opaque, + fib, flags, + &d, dst_address_length, + fib->old_hash_values, + fib->new_hash_values); + + p = hash_get (hash, dst_address_u32); + memcpy (p, fib->new_hash_values, vec_bytes (fib->new_hash_values)); + } +} + +void ip4_add_del_route (ip4_main_t * im, ip4_add_del_route_args_t * a) +{ + vlib_main_t * vm = vlib_get_main(); + ip_lookup_main_t * lm = &im->lookup_main; + ip4_fib_t * fib; + u32 dst_address, dst_address_length, adj_index, old_adj_index; + uword * hash, is_del; + ip4_add_del_route_callback_t * cb; + + if (vm->mc_main && ! 
(a->flags & IP4_ROUTE_FLAG_NO_REDISTRIBUTE)) + { + u32 multiple_messages_per_vlib_buffer = (a->flags & IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP); + mc_serialize2 (vm->mc_main, multiple_messages_per_vlib_buffer, + &ip4_add_del_route_msg, a); + return; + } + + /* Either create new adjacency or use given one depending on arguments. */ + if (a->n_add_adj > 0) + { + ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index); + ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0); + } + else + adj_index = a->adj_index; + + dst_address = a->dst_address.data_u32; + dst_address_length = a->dst_address_length; + fib = find_ip4_fib_by_table_index_or_id (im, a->table_index_or_table_id, a->flags); + + ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks)); + dst_address &= im->fib_masks[dst_address_length]; + + if (! fib->adj_index_by_dst_address[dst_address_length]) + ip4_fib_init_adj_index_by_dst_address (lm, fib, dst_address_length); + + hash = fib->adj_index_by_dst_address[dst_address_length]; + + is_del = (a->flags & IP4_ROUTE_FLAG_DEL) != 0; + + if (is_del) + { + fib->old_hash_values[0] = ~0; + hash = _hash_unset (hash, dst_address, fib->old_hash_values); + fib->adj_index_by_dst_address[dst_address_length] = hash; + + if (vec_len (im->add_del_route_callbacks) > 0 + && fib->old_hash_values[0] != ~0) /* make sure destination was found in hash */ + { + fib->new_hash_values[0] = ~0; + vec_foreach (cb, im->add_del_route_callbacks) + if ((a->flags & cb->required_flags) == cb->required_flags) + cb->function (im, cb->function_opaque, + fib, a->flags, + &a->dst_address, dst_address_length, + fib->old_hash_values, + fib->new_hash_values); + } + } + else + ip4_fib_set_adj_index (im, fib, a->flags, dst_address, dst_address_length, + adj_index); + + old_adj_index = fib->old_hash_values[0]; + + ip4_fib_mtrie_add_del_route (fib, a->dst_address, dst_address_length, + is_del ? old_adj_index : adj_index, + is_del); + + /* Delete old adjacency index if present and changed. 
*/ + if (! (a->flags & IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY) + && old_adj_index != ~0 + && old_adj_index != adj_index) + ip_del_adjacency (lm, old_adj_index); +} + +static void serialize_ip4_add_del_route_next_hop_msg (serialize_main_t * m, va_list * va) +{ + u32 flags = va_arg (*va, u32); + ip4_address_t * dst_address = va_arg (*va, ip4_address_t *); + u32 dst_address_length = va_arg (*va, u32); + ip4_address_t * next_hop_address = va_arg (*va, ip4_address_t *); + u32 next_hop_sw_if_index = va_arg (*va, u32); + u32 next_hop_weight = va_arg (*va, u32); + + serialize_likely_small_unsigned_integer (m, flags); + serialize (m, serialize_ip4_address_and_length, dst_address, dst_address_length); + serialize (m, serialize_ip4_address, next_hop_address); + serialize_likely_small_unsigned_integer (m, next_hop_sw_if_index); + serialize_likely_small_unsigned_integer (m, next_hop_weight); +} + +static void unserialize_ip4_add_del_route_next_hop_msg (serialize_main_t * m, va_list * va) +{ + ip4_main_t * im = &ip4_main; + u32 flags, dst_address_length, next_hop_sw_if_index, next_hop_weight; + ip4_address_t dst_address, next_hop_address; + + flags = unserialize_likely_small_unsigned_integer (m); + unserialize (m, unserialize_ip4_address_and_length, &dst_address, &dst_address_length); + unserialize (m, unserialize_ip4_address, &next_hop_address); + next_hop_sw_if_index = unserialize_likely_small_unsigned_integer (m); + next_hop_weight = unserialize_likely_small_unsigned_integer (m); + + ip4_add_del_route_next_hop + (im, + flags | IP4_ROUTE_FLAG_NO_REDISTRIBUTE, + &dst_address, + dst_address_length, + &next_hop_address, + next_hop_sw_if_index, + next_hop_weight, (u32)~0, + (u32)~0 /* explicit FIB index */); +} + +MC_SERIALIZE_MSG (ip4_add_del_route_next_hop_msg, static) = { + .name = "vnet_ip4_add_del_route_next_hop", + .serialize = serialize_ip4_add_del_route_next_hop_msg, + .unserialize = unserialize_ip4_add_del_route_next_hop_msg, +}; + +void +ip4_add_del_route_next_hop (ip4_main_t 
* im, + u32 flags, + ip4_address_t * dst_address, + u32 dst_address_length, + ip4_address_t * next_hop, + u32 next_hop_sw_if_index, + u32 next_hop_weight, u32 adj_index, + u32 explicit_fib_index) +{ + vnet_main_t * vnm = vnet_get_main(); + vlib_main_t * vm = vlib_get_main(); + ip_lookup_main_t * lm = &im->lookup_main; + u32 fib_index; + ip4_fib_t * fib; + u32 dst_address_u32, old_mp_adj_index, new_mp_adj_index; + u32 dst_adj_index, nh_adj_index; + uword * dst_hash, * dst_result; + uword * nh_hash, * nh_result; + ip_adjacency_t * dst_adj; + ip_multipath_adjacency_t * old_mp, * new_mp; + int is_del = (flags & IP4_ROUTE_FLAG_DEL) != 0; + int is_interface_next_hop; + clib_error_t * error = 0; + + if (vm->mc_main && ! (flags & IP4_ROUTE_FLAG_NO_REDISTRIBUTE)) + { + u32 multiple_messages_per_vlib_buffer = (flags & IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP); + mc_serialize2 (vm->mc_main, + multiple_messages_per_vlib_buffer, + &ip4_add_del_route_next_hop_msg, + flags, + dst_address, dst_address_length, + next_hop, next_hop_sw_if_index, next_hop_weight); + return; + } + + if (explicit_fib_index == (u32)~0) + fib_index = vec_elt (im->fib_index_by_sw_if_index, next_hop_sw_if_index); + else + fib_index = explicit_fib_index; + + fib = vec_elt_at_index (im->fibs, fib_index); + + /* Lookup next hop to be added or deleted. 
*/ + is_interface_next_hop = next_hop->data_u32 == 0; + if (adj_index == (u32)~0) + { + if (is_interface_next_hop) + { + nh_result = hash_get (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index); + if (nh_result) + nh_adj_index = *nh_result; + else + { + ip_adjacency_t * adj; + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &nh_adj_index); + ip4_adjacency_set_interface_route (vnm, adj, next_hop_sw_if_index, /* if_address_index */ ~0); + ip_call_add_del_adjacency_callbacks (lm, nh_adj_index, /* is_del */ 0); + hash_set (im->interface_route_adj_index_by_sw_if_index, next_hop_sw_if_index, nh_adj_index); + } + } + else + { + nh_hash = fib->adj_index_by_dst_address[32]; + nh_result = hash_get (nh_hash, next_hop->data_u32); + + /* Next hop must be known. */ + if (! nh_result) + { + vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_IN_FIB; + error = clib_error_return (0, "next-hop %U/32 not in FIB", + format_ip4_address, next_hop); + goto done; + } + nh_adj_index = *nh_result; + } + } + else + { + nh_adj_index = adj_index; + } + ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks)); + dst_address_u32 = dst_address->data_u32 & im->fib_masks[dst_address_length]; + + dst_hash = fib->adj_index_by_dst_address[dst_address_length]; + dst_result = hash_get (dst_hash, dst_address_u32); + if (dst_result) + { + dst_adj_index = dst_result[0]; + dst_adj = ip_get_adjacency (lm, dst_adj_index); + } + else + { + /* For deletes destination must be known. */ + if (is_del) + { + vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION; + error = clib_error_return (0, "unknown destination %U/%d", + format_ip4_address, dst_address, + dst_address_length); + goto done; + } + + dst_adj_index = ~0; + dst_adj = 0; + } + + /* Ignore adds of X/32 with next hop of X. */ + if (! 
is_del + && dst_address_length == 32 + && dst_address->data_u32 == next_hop->data_u32 + && adj_index != (u32)~0) + { + vnm->api_errno = VNET_API_ERROR_PREFIX_MATCHES_NEXT_HOP; + error = clib_error_return (0, "prefix matches next hop %U/%d", + format_ip4_address, dst_address, + dst_address_length); + goto done; + } + + old_mp_adj_index = dst_adj ? dst_adj->heap_handle : ~0; + + if (! ip_multipath_adjacency_add_del_next_hop + (lm, is_del, + old_mp_adj_index, + nh_adj_index, + next_hop_weight, + &new_mp_adj_index)) + { + vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_FOUND_MP; + error = clib_error_return (0, "requested deleting next-hop %U not found in multi-path", + format_ip4_address, next_hop); + goto done; + } + + old_mp = new_mp = 0; + if (old_mp_adj_index != ~0) + old_mp = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index); + if (new_mp_adj_index != ~0) + new_mp = vec_elt_at_index (lm->multipath_adjacencies, new_mp_adj_index); + + if (old_mp != new_mp) + { + ip4_add_del_route_args_t a; + a.table_index_or_table_id = fib_index; + a.flags = ((is_del && ! new_mp ? IP4_ROUTE_FLAG_DEL : IP4_ROUTE_FLAG_ADD) + | IP4_ROUTE_FLAG_FIB_INDEX + | IP4_ROUTE_FLAG_KEEP_OLD_ADJACENCY + | (flags & (IP4_ROUTE_FLAG_NO_REDISTRIBUTE | IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP))); + a.dst_address = dst_address[0]; + a.dst_address_length = dst_address_length; + a.adj_index = new_mp ? 
new_mp->adj_index : dst_adj_index; + a.add_adj = 0; + a.n_add_adj = 0; + + ip4_add_del_route (im, &a); + } + + done: + if (error) + clib_error_report (error); +} + +void * +ip4_get_route (ip4_main_t * im, + u32 table_index_or_table_id, + u32 flags, + u8 * address, + u32 address_length) +{ + ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags); + u32 dst_address = * (u32 *) address; + uword * hash, * p; + + ASSERT (address_length < ARRAY_LEN (im->fib_masks)); + dst_address &= im->fib_masks[address_length]; + + hash = fib->adj_index_by_dst_address[address_length]; + p = hash_get (hash, dst_address); + return (void *) p; +} + +void +ip4_foreach_matching_route (ip4_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip4_address_t * address, + u32 address_length, + ip4_address_t ** results, + u8 ** result_lengths) +{ + ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags); + u32 dst_address = address->data_u32; + u32 this_length = address_length; + + if (*results) + _vec_len (*results) = 0; + if (*result_lengths) + _vec_len (*result_lengths) = 0; + + while (this_length <= 32 && vec_len (results) == 0) + { + uword k, v; + hash_foreach (k, v, fib->adj_index_by_dst_address[this_length], ({ + if (0 == ((k ^ dst_address) & im->fib_masks[address_length])) + { + ip4_address_t a; + a.data_u32 = k; + vec_add1 (*results, a); + vec_add1 (*result_lengths, this_length); + } + })); + + this_length++; + } +} + +void ip4_maybe_remap_adjacencies (ip4_main_t * im, + u32 table_index_or_table_id, + u32 flags) +{ + ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_index_or_table_id, flags); + ip_lookup_main_t * lm = &im->lookup_main; + u32 i, l; + ip4_address_t a; + ip4_add_del_route_callback_t * cb; + static ip4_address_t * to_delete; + + if (lm->n_adjacency_remaps == 0) + return; + + for (l = 0; l <= 32; l++) + { + hash_pair_t * p; + uword * hash = fib->adj_index_by_dst_address[l]; + + if (hash_elts 
(hash) == 0) + continue; + + if (to_delete) + _vec_len (to_delete) = 0; + + /* Walk every prefix of length l. Prefixes whose adjacency has a remap + entry are either rewritten in place or queued in to_delete; removal is + deferred until the walk has finished. */ + hash_foreach_pair (p, hash, ({ + u32 adj_index = p->value[0]; + u32 m = vec_elt (lm->adjacency_remap_table, adj_index); + + if (m) + { + /* Record destination address from hash key. */ + a.data_u32 = p->key; + + /* New adjacency points to nothing: so delete prefix. */ + if (m == ~0) + vec_add1 (to_delete, a); + else + { + /* Remap to new adjacency. */ + memcpy (fib->old_hash_values, p->value, vec_bytes (fib->old_hash_values)); + + /* Set new adjacency value. */ + fib->new_hash_values[0] = p->value[0] = m - 1; + + vec_foreach (cb, im->add_del_route_callbacks) + if ((flags & cb->required_flags) == cb->required_flags) + cb->function (im, cb->function_opaque, + fib, flags | IP4_ROUTE_FLAG_ADD, + &a, l, + fib->old_hash_values, + fib->new_hash_values); + } + } + })); + + fib->new_hash_values[0] = ~0; + for (i = 0; i < vec_len (to_delete); i++) + { + hash = _hash_unset (hash, to_delete[i].data_u32, fib->old_hash_values); + /* Keep the fib's table pointer in step with the value returned by + _hash_unset, as ip4_add_del_route does. */ + fib->adj_index_by_dst_address[l] = hash; + vec_foreach (cb, im->add_del_route_callbacks) + if ((flags & cb->required_flags) == cb->required_flags) + /* Report the prefix actually being removed; a still holds the + last key visited by the walk above. */ + cb->function (im, cb->function_opaque, + fib, flags | IP4_ROUTE_FLAG_DEL, + &to_delete[i], l, + fib->old_hash_values, + fib->new_hash_values); + } + } + + /* Also remap adjacencies in mtrie. */ + ip4_mtrie_maybe_remap_adjacencies (lm, &fib->mtrie); + + /* Reset mapping table. */ + vec_zero (lm->adjacency_remap_table); + + /* All remaps have been performed. 
*/ + lm->n_adjacency_remaps = 0; +} + +void ip4_delete_matching_routes (ip4_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip4_address_t * address, + u32 address_length) +{ + static ip4_address_t * matching_addresses; + static u8 * matching_address_lengths; + u32 l, i; + ip4_add_del_route_args_t a; + + a.flags = IP4_ROUTE_FLAG_DEL | IP4_ROUTE_FLAG_NO_REDISTRIBUTE | flags; + a.table_index_or_table_id = table_index_or_table_id; + a.adj_index = ~0; + a.add_adj = 0; + a.n_add_adj = 0; + + for (l = address_length + 1; l <= 32; l++) + { + ip4_foreach_matching_route (im, table_index_or_table_id, flags, + address, + l, + &matching_addresses, + &matching_address_lengths); + for (i = 0; i < vec_len (matching_addresses); i++) + { + a.dst_address = matching_addresses[i]; + a.dst_address_length = matching_address_lengths[i]; + ip4_add_del_route (im, &a); + } + } + + ip4_maybe_remap_adjacencies (im, table_index_or_table_id, flags); +} + +always_inline uword +ip4_lookup_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + int lookup_for_responses_to_locally_received_packets) +{ + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters; + u32 n_left_from, n_left_to_next, * from, * to_next; + ip_lookup_next_t next; + u32 cpu_index = os_get_cpu_number(); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t * p0, * p1; + ip4_header_t * ip0, * ip1; + __attribute__((unused)) tcp_header_t * tcp0, * tcp1; + ip_lookup_next_t next0, next1; + ip_adjacency_t * adj0, * adj1; + ip4_fib_mtrie_t * mtrie0, * mtrie1; + ip4_fib_mtrie_leaf_t leaf0, leaf1; + __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0; + 
__attribute__((unused)) u32 pi1, fib_index1, adj_index1, is_tcp_udp1; + u32 flow_hash_config0, flow_hash_config1; + u32 hash_c0, hash_c1; + u32 wrong_next; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD); + CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]); + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]); + fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ? + fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX]; + fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ? + fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX]; + + + if (! lookup_for_responses_to_locally_received_packets) + { + mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie; + mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie; + + leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 0); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 0); + } + + tcp0 = (void *) (ip0 + 1); + tcp1 = (void *) (ip1 + 1); + + is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP + || ip0->protocol == IP_PROTOCOL_UDP); + is_tcp_udp1 = (ip1->protocol == IP_PROTOCOL_TCP + || ip1->protocol == IP_PROTOCOL_UDP); + + if (! lookup_for_responses_to_locally_received_packets) + { + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 1); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 1); + } + + if (! 
lookup_for_responses_to_locally_received_packets) + { + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 2); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 2); + } + + if (! lookup_for_responses_to_locally_received_packets) + { + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 3); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->dst_address, 3); + } + + if (lookup_for_responses_to_locally_received_packets) + { + adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX]; + adj_index1 = vnet_buffer (p1)->ip.adj_index[VLIB_RX]; + } + else + { + /* Handle default route. */ + leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0); + leaf1 = (leaf1 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie1->default_leaf : leaf1); + + adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1); + } + + ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0, + &ip0->dst_address, + /* no_default_route */ 0)); + ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1, + &ip1->dst_address, + /* no_default_route */ 0)); + adj0 = ip_get_adjacency (lm, adj_index0); + adj1 = ip_get_adjacency (lm, adj_index1); + + next0 = adj0->lookup_next_index; + next1 = adj1->lookup_next_index; + + /* Use flow hash to compute multipath adjacency. 
*/ + hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0; + hash_c1 = vnet_buffer (p1)->ip.flow_hash = 0; + if (PREDICT_FALSE (adj0->n_adj > 1)) + { + flow_hash_config0 = + vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config; + hash_c0 = vnet_buffer (p0)->ip.flow_hash = + ip4_compute_flow_hash (ip0, flow_hash_config0); + } + if (PREDICT_FALSE(adj1->n_adj > 1)) + { + flow_hash_config1 = + vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config; + hash_c1 = vnet_buffer (p1)->ip.flow_hash = + ip4_compute_flow_hash (ip1, flow_hash_config1); + } + + ASSERT (adj0->n_adj > 0); + ASSERT (adj1->n_adj > 0); + ASSERT (is_pow2 (adj0->n_adj)); + ASSERT (is_pow2 (adj1->n_adj)); + adj_index0 += (hash_c0 & (adj0->n_adj - 1)); + adj_index1 += (hash_c1 & (adj1->n_adj - 1)); + + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0; + vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1; + + vlib_increment_combined_counter + (cm, cpu_index, adj_index0, 1, + vlib_buffer_length_in_chain (vm, p0) + + sizeof(ethernet_header_t)); + vlib_increment_combined_counter + (cm, cpu_index, adj_index1, 1, + vlib_buffer_length_in_chain (vm, p1) + + sizeof(ethernet_header_t)); + + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + wrong_next = (next0 != next) + 2*(next1 != next); + if (PREDICT_FALSE (wrong_next != 0)) + { + switch (wrong_next) + { + case 1: + /* A B A */ + to_next[-2] = pi1; + to_next -= 1; + n_left_to_next += 1; + vlib_set_next_frame_buffer (vm, node, next0, pi0); + break; + + case 2: + /* A A B */ + to_next -= 1; + n_left_to_next += 1; + vlib_set_next_frame_buffer (vm, node, next1, pi1); + break; + + case 3: + /* A B C */ + to_next -= 2; + n_left_to_next += 2; + vlib_set_next_frame_buffer (vm, node, next0, pi0); + vlib_set_next_frame_buffer (vm, node, next1, pi1); + if (next0 == next1) + { + /* A B B */ + vlib_put_next_frame (vm, node, next, n_left_to_next); + next = next1; + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + } + } + } + } + + 
while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + __attribute__((unused)) tcp_header_t * tcp0; + ip_lookup_next_t next0; + ip_adjacency_t * adj0; + ip4_fib_mtrie_t * mtrie0; + ip4_fib_mtrie_leaf_t leaf0; + __attribute__((unused)) u32 pi0, fib_index0, adj_index0, is_tcp_udp0; + u32 flow_hash_config0, hash_c0; + + pi0 = from[0]; + to_next[0] = pi0; + + p0 = vlib_get_buffer (vm, pi0); + + ip0 = vlib_buffer_get_current (p0); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]); + fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ? + fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX]; + + if (! lookup_for_responses_to_locally_received_packets) + { + mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie; + + leaf0 = IP4_FIB_MTRIE_LEAF_ROOT; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 0); + } + + tcp0 = (void *) (ip0 + 1); + + is_tcp_udp0 = (ip0->protocol == IP_PROTOCOL_TCP + || ip0->protocol == IP_PROTOCOL_UDP); + + if (! lookup_for_responses_to_locally_received_packets) + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 1); + + if (! lookup_for_responses_to_locally_received_packets) + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 2); + + if (! lookup_for_responses_to_locally_received_packets) + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->dst_address, 3); + + if (lookup_for_responses_to_locally_received_packets) + adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_RX]; + else + { + /* Handle default route. */ + leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? 
mtrie0->default_leaf : leaf0); + adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + } + + ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0, + &ip0->dst_address, + /* no_default_route */ 0)); + + adj0 = ip_get_adjacency (lm, adj_index0); + + next0 = adj0->lookup_next_index; + + /* Use flow hash to compute multipath adjacency. */ + hash_c0 = vnet_buffer (p0)->ip.flow_hash = 0; + if (PREDICT_FALSE(adj0->n_adj > 1)) + { + flow_hash_config0 = + vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config; + + hash_c0 = vnet_buffer (p0)->ip.flow_hash = + ip4_compute_flow_hash (ip0, flow_hash_config0); + } + + ASSERT (adj0->n_adj > 0); + ASSERT (is_pow2 (adj0->n_adj)); + adj_index0 += (hash_c0 & (adj0->n_adj - 1)); + + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0; + + vlib_increment_combined_counter + (cm, cpu_index, adj_index0, 1, + vlib_buffer_length_in_chain (vm, p0) + + sizeof(ethernet_header_t)); + + from += 1; + to_next += 1; + n_left_to_next -= 1; + n_left_from -= 1; + + if (PREDICT_FALSE (next0 != next)) + { + n_left_to_next += 1; + vlib_put_next_frame (vm, node, next, n_left_to_next); + next = next0; + vlib_get_next_frame (vm, node, next, + to_next, n_left_to_next); + to_next[0] = pi0; + to_next += 1; + n_left_to_next -= 1; + } + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + return frame->n_vectors; +} + +static uword +ip4_lookup (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip4_lookup_inline (vm, node, frame, /* lookup_for_responses_to_locally_received_packets */ 0); + +} + +void ip4_adjacency_set_interface_route (vnet_main_t * vnm, + ip_adjacency_t * adj, + u32 sw_if_index, + u32 if_address_index) +{ + vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + ip_lookup_next_t n; + vnet_l3_packet_type_t packet_type; + u32 node_index; + + if (hw->hw_class_index == ethernet_hw_interface_class.index + || hw->hw_class_index == srp_hw_interface_class.index) + { + 
/* + * We have a bit of a problem in this case. ip4-arp uses + * the rewrite_header.next_index to hand pkts to the + * indicated interface output node. We can end up in + * ip4_rewrite_local, too, which also pays attention to + * rewrite_header.next_index. Net result: a hack in + * ip4_rewrite_local... + */ + n = IP_LOOKUP_NEXT_ARP; + node_index = ip4_arp_node.index; + adj->if_address_index = if_address_index; + packet_type = VNET_L3_PACKET_TYPE_ARP; + } + else + { + n = IP_LOOKUP_NEXT_REWRITE; + node_index = ip4_rewrite_node.index; + packet_type = VNET_L3_PACKET_TYPE_IP4; + } + + adj->lookup_next_index = n; + vnet_rewrite_for_sw_interface + (vnm, + packet_type, + sw_if_index, + node_index, + VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST, + &adj->rewrite_header, + sizeof (adj->rewrite_data)); +} + /* Install the FIB entries for interface address a: the connected prefix (only when shorter than /32) and the /32 local route. */ +static void +ip4_add_interface_routes (u32 sw_if_index, + ip4_main_t * im, u32 fib_index, + ip_interface_address_t * a) +{ + vnet_main_t * vnm = vnet_get_main(); + ip_lookup_main_t * lm = &im->lookup_main; + ip_adjacency_t * adj; + ip4_address_t * address = ip_interface_address_get_address (lm, a); + ip4_add_del_route_args_t x; + vnet_hw_interface_t * hw_if = vnet_get_sup_hw_interface (vnm, sw_if_index); + u32 classify_table_index; + + /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */ + x.table_index_or_table_id = fib_index; + x.flags = (IP4_ROUTE_FLAG_ADD + | IP4_ROUTE_FLAG_FIB_INDEX + | IP4_ROUTE_FLAG_NO_REDISTRIBUTE); + x.dst_address = address[0]; + x.dst_address_length = a->address_length; + x.n_add_adj = 0; + x.add_adj = 0; + + a->neighbor_probe_adj_index = ~0; + if (a->address_length < 32) + { + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &x.adj_index); + ip4_adjacency_set_interface_route (vnm, adj, sw_if_index, a - lm->if_address_pool); + ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0); + ip4_add_del_route (im, &x); + a->neighbor_probe_adj_index = x.adj_index; + } + + /* Add e.g.
1.1.1.1/32 as local to this host. */ + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &x.adj_index); + /* Use the per-interface classify table when one is configured (index != ~0); otherwise deliver locally. */ + classify_table_index = ~0; + if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index)) + classify_table_index = lm->classify_table_index_by_sw_if_index [sw_if_index]; + if (classify_table_index != (u32) ~0) + { + adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY; + adj->classify_table_index = classify_table_index; + } + else + adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL; + + adj->if_address_index = a - lm->if_address_pool; + adj->rewrite_header.sw_if_index = sw_if_index; + adj->rewrite_header.max_l3_packet_bytes = hw_if->max_l3_packet_bytes[VLIB_RX]; + /* + * Local adjs are never to be rewritten. Spoofed pkts w/ src = dst = local + * fail an RPF-ish check, but still go thru the rewrite code... + */ + adj->rewrite_header.data_bytes = 0; + + ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0); + x.dst_address_length = 32; + ip4_add_del_route (im, &x); +} + /* Remove the routes installed for an interface address: the connected prefix (when shorter than /32), the /32 host route, then any longer-prefix routes found by ip4_delete_matching_routes. */ +static void +ip4_del_interface_routes (ip4_main_t * im, u32 fib_index, ip4_address_t * address, u32 address_length) +{ + ip4_add_del_route_args_t x; + + /* Delete e.g. the 1.0.0.0/8 interface route (arp for Ethernet).
*/ + x.table_index_or_table_id = fib_index; + x.flags = (IP4_ROUTE_FLAG_DEL + | IP4_ROUTE_FLAG_FIB_INDEX + | IP4_ROUTE_FLAG_NO_REDISTRIBUTE); + x.dst_address = address[0]; + x.dst_address_length = address_length; + x.adj_index = ~0; + x.n_add_adj = 0; + x.add_adj = 0; + + if (address_length < 32) + ip4_add_del_route (im, &x); + /* Delete the /32 route for the address itself. */ + x.dst_address_length = 32; + ip4_add_del_route (im, &x); + /* Delete remaining routes with a longer prefix that match this address. */ + ip4_delete_matching_routes (im, + fib_index, + IP4_ROUTE_FLAG_FIB_INDEX, + address, + address_length); +} + /* One interface address (interface, address, prefix length) as carried by the serialize/unserialize routines below. */ +typedef struct { + u32 sw_if_index; + ip4_address_t address; + u32 length; +} ip4_interface_address_t; + /* Serialize n ip4_interface_address_t records field by field. */ +static void serialize_vec_ip4_set_interface_address (serialize_main_t * m, va_list * va) +{ + ip4_interface_address_t * a = va_arg (*va, ip4_interface_address_t *); + u32 n = va_arg (*va, u32); + u32 i; + for (i = 0; i < n; i++) { + serialize_integer (m, a[i].sw_if_index, sizeof (a[i].sw_if_index)); + serialize (m, serialize_ip4_address, &a[i].address); + serialize_integer (m, a[i].length, sizeof (a[i].length)); + } +} + /* Inverse of serialize_vec_ip4_set_interface_address: fields are read in the same order. */ +static void unserialize_vec_ip4_set_interface_address (serialize_main_t * m, va_list * va) +{ + ip4_interface_address_t * a = va_arg (*va, ip4_interface_address_t *); + u32 n = va_arg (*va, u32); + u32 i; + for (i = 0; i < n; i++) { + unserialize_integer (m, &a[i].sw_if_index, sizeof (a[i].sw_if_index)); + unserialize (m, unserialize_ip4_address, &a[i].address); + unserialize_integer (m, &a[i].length, sizeof (a[i].length)); + } +} + /* Message body: a single address record followed by the is_del flag. */ +static void serialize_ip4_set_interface_address_msg (serialize_main_t * m, va_list * va) +{ + ip4_interface_address_t * a = va_arg (*va, ip4_interface_address_t *); + int is_del = va_arg (*va, int); + serialize (m, serialize_vec_ip4_set_interface_address, a, 1); + serialize_integer (m, is_del, sizeof (is_del)); +} + /* Forward declaration: the unserialize handler that follows calls this before its definition. */ +static clib_error_t * +ip4_add_del_interface_address_internal (vlib_main_t * vm, + u32 sw_if_index, + ip4_address_t * new_address, + u32 new_length, + u32 redistribute, + u32 insert_routes, + u32 is_del); +
+static void unserialize_ip4_set_interface_address_msg (serialize_main_t * m, va_list * va) +{ + mc_main_t * mcm = va_arg (*va, mc_main_t *); + vlib_main_t * vm = mcm->vlib_main; + ip4_interface_address_t a; + clib_error_t * error; + int is_del; + + unserialize (m, unserialize_vec_ip4_set_interface_address, &a, 1); + unserialize_integer (m, &is_del, sizeof (is_del)); + error = ip4_add_del_interface_address_internal + (vm, a.sw_if_index, &a.address, a.length, + /* redistribute */ 0, + /* insert_routes */ 1, + is_del); + if (error) + clib_error_report (error); +} + +MC_SERIALIZE_MSG (ip4_set_interface_address_msg, static) = { + .name = "vnet_ip4_set_interface_address", + .serialize = serialize_ip4_set_interface_address_msg, + .unserialize = unserialize_ip4_set_interface_address_msg, +}; + +static clib_error_t * +ip4_add_del_interface_address_internal (vlib_main_t * vm, + u32 sw_if_index, + ip4_address_t * address, + u32 address_length, + u32 redistribute, + u32 insert_routes, + u32 is_del) +{ + vnet_main_t * vnm = vnet_get_main(); + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + clib_error_t * error = 0; + u32 if_address_index, elts_before; + ip4_address_fib_t ip4_af, * addr_fib = 0; + + vec_validate (im->fib_index_by_sw_if_index, sw_if_index); + ip4_addr_fib_init (&ip4_af, address, + vec_elt (im->fib_index_by_sw_if_index, sw_if_index)); + vec_add1 (addr_fib, ip4_af); + + /* When adding an address check that it does not conflict with an existing address. */ + if (! 
is_del) + { + ip_interface_address_t * ia; + foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, + 0 /* honor unnumbered */, + ({ + ip4_address_t * x = ip_interface_address_get_address (&im->lookup_main, ia); + + if (ip4_destination_matches_route (im, address, x, ia->address_length) + || ip4_destination_matches_route (im, x, address, address_length)) + return clib_error_create ("failed to add %U which conflicts with %U for interface %U", + format_ip4_address_and_length, address, address_length, + format_ip4_address_and_length, x, ia->address_length, + format_vnet_sw_if_index_name, vnm, sw_if_index); + })); + } + + if (vm->mc_main && redistribute) + { + ip4_interface_address_t a; + a.sw_if_index = sw_if_index; + a.address = address[0]; + a.length = address_length; + mc_serialize (vm->mc_main, &ip4_set_interface_address_msg, + &a, (int)is_del); + goto done; + } + + elts_before = pool_elts (lm->if_address_pool); + + error = ip_interface_address_add_del + (lm, + sw_if_index, + addr_fib, + address_length, + is_del, + &if_address_index); + if (error) + goto done; + + if (vnet_sw_interface_is_admin_up (vnm, sw_if_index) && insert_routes) + { + if (is_del) + ip4_del_interface_routes (im, ip4_af.fib_index, address, + address_length); + + else + ip4_add_interface_routes (sw_if_index, + im, ip4_af.fib_index, + pool_elt_at_index + (lm->if_address_pool, if_address_index)); + } + + /* If pool did not grow/shrink: add duplicate address. 
*/ + if (elts_before != pool_elts (lm->if_address_pool)) + { + ip4_add_del_interface_address_callback_t * cb; + vec_foreach (cb, im->add_del_interface_address_callbacks) + cb->function (im, cb->function_opaque, sw_if_index, + address, address_length, + if_address_index, + is_del); + } + + done: + vec_free (addr_fib); + return error; +} + +clib_error_t * +ip4_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index, + ip4_address_t * address, u32 address_length, + u32 is_del) +{ + return ip4_add_del_interface_address_internal + (vm, sw_if_index, address, address_length, + /* redistribute */ 1, + /* insert_routes */ 1, + is_del); +} + +static void serialize_ip4_fib (serialize_main_t * m, va_list * va) +{ + ip4_fib_t * f = va_arg (*va, ip4_fib_t *); + u32 l, dst, adj_index; + + serialize_integer (m, f->table_id, sizeof (f->table_id)); + for (l = 0; l < ARRAY_LEN (f->adj_index_by_dst_address); l++) + { + u32 n_elts = hash_elts (f->adj_index_by_dst_address[l]); + + serialize_integer (m, n_elts, sizeof (n_elts)); + hash_foreach (dst, adj_index, f->adj_index_by_dst_address[l], ({ + ip4_address_t tmp; + tmp.as_u32 = dst; + serialize (m, serialize_ip4_address, &tmp); + serialize_integer (m, adj_index, sizeof (adj_index)); + })); + } +} + +static void unserialize_ip4_fib (serialize_main_t * m, va_list * va) +{ + ip4_add_del_route_args_t a; + u32 i; + + a.flags = (IP4_ROUTE_FLAG_ADD + | IP4_ROUTE_FLAG_NO_REDISTRIBUTE + | IP4_ROUTE_FLAG_TABLE_ID); + a.n_add_adj = 0; + a.add_adj = 0; + + unserialize_integer (m, &a.table_index_or_table_id, + sizeof (a.table_index_or_table_id)); + + for (i = 0; i < STRUCT_ARRAY_LEN (ip4_fib_t, adj_index_by_dst_address); i++) + { + u32 n_elts; + unserialize_integer (m, &n_elts, sizeof (u32)); + a.dst_address_length = i; + while (n_elts > 0) + { + unserialize (m, unserialize_ip4_address, &a.dst_address); + unserialize_integer (m, &a.adj_index, sizeof (a.adj_index)); + ip4_add_del_route (&ip4_main, &a); + n_elts--; + } + } +} + +void 
serialize_vnet_ip4_main (serialize_main_t * m, va_list * va) +{ + vnet_main_t * vnm = va_arg (*va, vnet_main_t *); + vnet_interface_main_t * vim = &vnm->interface_main; + vnet_sw_interface_t * si; + ip4_main_t * i4m = &ip4_main; + ip4_interface_address_t * as = 0, * a; + + /* Download adjacency tables & multipath stuff. */ + serialize (m, serialize_ip_lookup_main, &i4m->lookup_main); + + /* FIBs. */ + { + ip4_fib_t * f; + u32 n_fibs = vec_len (i4m->fibs); + serialize_integer (m, n_fibs, sizeof (n_fibs)); + vec_foreach (f, i4m->fibs) + serialize (m, serialize_ip4_fib, f); + } + + /* FIB interface config. */ + vec_serialize (m, i4m->fib_index_by_sw_if_index, serialize_vec_32); + + /* Interface ip4 addresses. */ + pool_foreach (si, vim->sw_interfaces, ({ + u32 sw_if_index = si->sw_if_index; + ip_interface_address_t * ia; + foreach_ip_interface_address (&i4m->lookup_main, ia, sw_if_index, + 0 /* honor unnumbered */, + ({ + ip4_address_t * x = ip_interface_address_get_address (&i4m->lookup_main, ia); + vec_add2 (as, a, 1); + a->address = x[0]; + a->length = ia->address_length; + a->sw_if_index = sw_if_index; + })); + })); + vec_serialize (m, as, serialize_vec_ip4_set_interface_address); + vec_free (as); +} + +void unserialize_vnet_ip4_main (serialize_main_t * m, va_list * va) +{ + vlib_main_t * vm = va_arg (*va, vlib_main_t *); + ip4_main_t * i4m = &ip4_main; + ip4_interface_address_t * as = 0, * a; + + unserialize (m, unserialize_ip_lookup_main, &i4m->lookup_main); + + { + ip_adjacency_t * adj, * adj_heap; + u32 n_adj; + adj_heap = i4m->lookup_main.adjacency_heap; + heap_foreach (adj, n_adj, adj_heap, ({ + unserialize_fixup_ip4_rewrite_adjacencies (vm, adj, n_adj); + ip_call_add_del_adjacency_callbacks (&i4m->lookup_main, adj - adj_heap, /* is_del */ 0); + })); + } + + /* FIBs */ + { + u32 i, n_fibs; + unserialize_integer (m, &n_fibs, sizeof (n_fibs)); + for (i = 0; i < n_fibs; i++) + unserialize (m, unserialize_ip4_fib); + } + + vec_unserialize (m, 
&i4m->fib_index_by_sw_if_index, unserialize_vec_32); + + vec_unserialize (m, &as, unserialize_vec_ip4_set_interface_address); + vec_foreach (a, as) { + ip4_add_del_interface_address_internal + (vm, a->sw_if_index, &a->address, a->length, + /* redistribute */ 0, + /* insert_routes */ 0, + /* is_del */ 0); + } + vec_free (as); +} + +static clib_error_t * +ip4_sw_interface_admin_up_down (vnet_main_t * vnm, + u32 sw_if_index, + u32 flags) +{ + ip4_main_t * im = &ip4_main; + ip_interface_address_t * ia; + ip4_address_t * a; + u32 is_admin_up, fib_index; + + /* Fill in lookup tables with default table (0). */ + vec_validate (im->fib_index_by_sw_if_index, sw_if_index); + + vec_validate_init_empty (im->lookup_main.if_address_pool_index_by_sw_if_index, sw_if_index, ~0); + + is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + + fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index); + + foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, + 0 /* honor unnumbered */, + ({ + a = ip_interface_address_get_address (&im->lookup_main, ia); + if (is_admin_up) + ip4_add_interface_routes (sw_if_index, + im, fib_index, + ia); + else + ip4_del_interface_routes (im, fib_index, + a, ia->address_length); + })); + + return 0; +} + +VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip4_sw_interface_admin_up_down); + +static clib_error_t * +ip4_sw_interface_add_del (vnet_main_t * vnm, + u32 sw_if_index, + u32 is_add) +{ + vlib_main_t * vm = vnm->vlib_main; + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + u32 ci, cast; + + for (cast = 0; cast < VNET_N_CAST; cast++) + { + ip_config_main_t * cm = &lm->rx_config_mains[cast]; + vnet_config_main_t * vcm = &cm->config_main; + + if (! 
vcm->node_index_by_feature_index) + { + if (cast == VNET_UNICAST) + { + static char * start_nodes[] = { "ip4-input", "ip4-input-no-checksum", }; + static char * feature_nodes[] = { + [IP4_RX_FEATURE_CHECK_ACCESS] = "ip4-inacl", + [IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_RX] = "ip4-source-check-via-rx", + [IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_ANY] = "ip4-source-check-via-any", + [IP4_RX_FEATURE_IPSEC] = "ipsec-input-ip4", + [IP4_RX_FEATURE_VPATH] = "vpath-input-ip4", + [IP4_RX_FEATURE_LOOKUP] = "ip4-lookup", + }; + + vnet_config_init (vm, vcm, + start_nodes, ARRAY_LEN (start_nodes), + feature_nodes, ARRAY_LEN (feature_nodes)); + } + else + { + static char * start_nodes[] = { "ip4-input", "ip4-input-no-checksum", }; + static char * feature_nodes[] = { + [IP4_RX_FEATURE_VPATH] = "vpath-input-ip4", + [IP4_RX_FEATURE_LOOKUP] = "ip4-lookup-multicast", + }; + + vnet_config_init (vm, vcm, + start_nodes, ARRAY_LEN (start_nodes), + feature_nodes, ARRAY_LEN (feature_nodes)); + } + } + + vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0); + ci = cm->config_index_by_sw_if_index[sw_if_index]; + + if (is_add) + ci = vnet_config_add_feature (vm, vcm, + ci, + IP4_RX_FEATURE_LOOKUP, + /* config data */ 0, + /* # bytes of config data */ 0); + else + ci = vnet_config_del_feature (vm, vcm, + ci, + IP4_RX_FEATURE_LOOKUP, + /* config data */ 0, + /* # bytes of config data */ 0); + + cm->config_index_by_sw_if_index[sw_if_index] = ci; + } + + return /* no error */ 0; +} + +VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip4_sw_interface_add_del); + +VLIB_REGISTER_NODE (ip4_lookup_node) = { + .function = ip4_lookup, + .name = "ip4-lookup", + .vector_size = sizeof (u32), + + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = "ip4-miss", + [IP_LOOKUP_NEXT_DROP] = "ip4-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip4-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip4-local", + [IP_LOOKUP_NEXT_ARP] = "ip4-arp", + [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit", + 
[IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify", + [IP_LOOKUP_NEXT_MAP] = "ip4-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd", + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop", + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop", + }, +}; + +/* Global IP4 main. */ +ip4_main_t ip4_main; + +clib_error_t * +ip4_lookup_init (vlib_main_t * vm) +{ + ip4_main_t * im = &ip4_main; + uword i; + + for (i = 0; i < ARRAY_LEN (im->fib_masks); i++) + { + u32 m; + + if (i < 32) + m = pow2_mask (i) << (32 - i); + else + m = ~0; + im->fib_masks[i] = clib_host_to_net_u32 (m); + } + + /* Create FIB with index 0 and table id of 0. */ + find_ip4_fib_by_table_index_or_id (im, /* table id */ 0, IP4_ROUTE_FLAG_TABLE_ID); + + ip_lookup_init (&im->lookup_main, /* is_ip6 */ 0); + + { + pg_node_t * pn; + pn = pg_get_node (ip4_lookup_node.index); + pn->unformat_edit = unformat_pg_ip4_header; + } + + { + ethernet_arp_header_t h; + + memset (&h, 0, sizeof (h)); + + /* Set target ethernet address to all zeros. */ + memset (h.ip4_over_ethernet[1].ethernet, 0, sizeof (h.ip4_over_ethernet[1].ethernet)); + +#define _16(f,v) h.f = clib_host_to_net_u16 (v); +#define _8(f,v) h.f = v; + _16 (l2_type, ETHERNET_ARP_HARDWARE_TYPE_ethernet); + _16 (l3_type, ETHERNET_TYPE_IP4); + _8 (n_l2_address_bytes, 6); + _8 (n_l3_address_bytes, 4); + _16 (opcode, ETHERNET_ARP_OPCODE_request); +#undef _16 +#undef _8 + + vlib_packet_template_init (vm, + &im->ip4_arp_request_packet_template, + /* data */ &h, + sizeof (h), + /* alloc chunk size */ 8, + "ip4 arp"); + } + + return 0; +} + +VLIB_INIT_FUNCTION (ip4_lookup_init); + +typedef struct { + /* Adjacency taken. */ + u32 adj_index; + u32 flow_hash; + u32 fib_index; + + /* Packet data, possibly *after* rewrite. 
*/ + u8 packet_data[64 - 1*sizeof(u32)]; +} ip4_forward_next_trace_t; + +static u8 * format_ip4_forward_next_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip4_forward_next_trace_t * t = va_arg (*args, ip4_forward_next_trace_t *); + vnet_main_t * vnm = vnet_get_main(); + ip4_main_t * im = &ip4_main; + ip_adjacency_t * adj; + uword indent = format_get_indent (s); + + adj = ip_get_adjacency (&im->lookup_main, t->adj_index); + s = format (s, "fib: %d adjacency: %U flow hash: 0x%08x", + t->fib_index, format_ip_adjacency, + vnm, &im->lookup_main, t->adj_index, t->flow_hash); + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_REWRITE: + s = format (s, "\n%U%U", + format_white_space, indent, + format_ip_adjacency_packet_data, + vnm, &im->lookup_main, t->adj_index, + t->packet_data, sizeof (t->packet_data)); + break; + + default: + break; + } + + return s; +} + +/* Common trace function for all ip4-forward next nodes. */ +void +ip4_forward_next_trace (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + vlib_rx_or_tx_t which_adj_index) +{ + u32 * from, n_left; + ip4_main_t * im = &ip4_main; + + n_left = frame->n_vectors; + from = vlib_frame_vector_args (frame); + + while (n_left >= 4) + { + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + ip4_forward_next_trace_t * t0, * t1; + + /* Prefetch next iteration. 
*/ + vlib_prefetch_buffer_with_index (vm, from[2], LOAD); + vlib_prefetch_buffer_with_index (vm, from[3], LOAD); + + bi0 = from[0]; + bi1 = from[1]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index]; + t0->flow_hash = vnet_buffer (b0)->ip.flow_hash; + t0->fib_index = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + memcpy (t0->packet_data, + vlib_buffer_get_current (b0), + sizeof (t0->packet_data)); + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->adj_index = vnet_buffer (b1)->ip.adj_index[which_adj_index]; + t1->flow_hash = vnet_buffer (b1)->ip.flow_hash; + t1->fib_index = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer(b1)->sw_if_index[VLIB_RX]); + memcpy (t1->packet_data, + vlib_buffer_get_current (b1), + sizeof (t1->packet_data)); + } + from += 2; + n_left -= 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t * b0; + ip4_forward_next_trace_t * t0; + + bi0 = from[0]; + + b0 = vlib_get_buffer (vm, bi0); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index]; + t0->flow_hash = vnet_buffer (b0)->ip.flow_hash; + t0->fib_index = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer(b0)->sw_if_index[VLIB_RX]); + memcpy (t0->packet_data, + vlib_buffer_get_current (b0), + sizeof (t0->packet_data)); + } + from += 1; + n_left -= 1; + } +} + +static uword +ip4_drop_or_punt (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + ip4_error_t error_code) +{ + u32 * buffers = vlib_frame_vector_args (frame); + uword n_packets = frame->n_vectors; + + vlib_error_drop_buffers (vm, node, + buffers, + /* stride */ 1, + n_packets, + /* next */ 0, + 
ip4_input_node.index, + error_code); + + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip4_forward_next_trace (vm, node, frame, VLIB_TX); + + return n_packets; +} + +static uword +ip4_drop (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_DROP); } + +static uword +ip4_punt (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_ADJACENCY_PUNT); } + +static uword +ip4_miss (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip4_drop_or_punt (vm, node, frame, IP4_ERROR_DST_LOOKUP_MISS); } + +VLIB_REGISTER_NODE (ip4_drop_node,static) = { + .function = ip4_drop, + .name = "ip4-drop", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +VLIB_REGISTER_NODE (ip4_punt_node,static) = { + .function = ip4_punt, + .name = "ip4-punt", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-punt", + }, +}; + +VLIB_REGISTER_NODE (ip4_miss_node,static) = { + .function = ip4_miss, + .name = "ip4-miss", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +/* Compute TCP/UDP/ICMP4 checksum in software. */ +u16 +ip4_tcp_udp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, + ip4_header_t * ip0) +{ + ip_csum_t sum0; + u32 ip_header_length, payload_length_host_byte_order; + u32 n_this_buffer, n_bytes_left; + u16 sum16; + void * data_this_buffer; + + /* Initialize checksum with ip header. 
*/ + ip_header_length = ip4_header_bytes (ip0); + payload_length_host_byte_order = clib_net_to_host_u16 (ip0->length) - ip_header_length; + sum0 = clib_host_to_net_u32 (payload_length_host_byte_order + (ip0->protocol << 16)); + + /* Add the addresses: two u32 loads on 32-bit uword, otherwise one u64 load starting at src_address (presumably covering dst_address as well). */ + if (BITS (uword) == 32) + { + sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u32)); + sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32)); + } + else + sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->src_address, u64)); + + n_bytes_left = n_this_buffer = payload_length_host_byte_order; + data_this_buffer = (void *) ip0 + ip_header_length; + /* First buffer may hold only part of the payload: limit to what it actually contains after the IP header. */ + if (n_this_buffer + ip_header_length > p0->current_length) + n_this_buffer = p0->current_length > ip_header_length ? p0->current_length - ip_header_length : 0; + while (1) + { + sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer); + n_bytes_left -= n_this_buffer; + if (n_bytes_left == 0) + break; + + ASSERT (p0->flags & VLIB_BUFFER_NEXT_PRESENT); + p0 = vlib_get_buffer (vm, p0->next_buffer); + data_this_buffer = vlib_buffer_get_current (p0); + /* Never take more than is still owed: a buffer longer than the remaining payload would wrap the unsigned n_bytes_left. */ + n_this_buffer = p0->current_length < n_bytes_left ? p0->current_length : n_bytes_left; + } + + sum16 = ~ ip_csum_fold (sum0); + + return sum16; +} + +static u32 +ip4_tcp_udp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0) +{ + ip4_header_t * ip0 = vlib_buffer_get_current (p0); + udp_header_t * udp0; + u16 sum16; + + ASSERT (ip0->protocol == IP_PROTOCOL_TCP + || ip0->protocol == IP_PROTOCOL_UDP); + + udp0 = (void *) (ip0 + 1); + /* A UDP checksum field of zero is accepted as correct without computing anything. */ + if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0) + { + p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED + | IP_BUFFER_L4_CHECKSUM_CORRECT); + return p0->flags; + } + + sum16 = ip4_tcp_udp_compute_checksum (vm, p0, ip0); + + /* Record that the check ran, and mark it correct only when the computed sum is zero. */ + p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED + | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT)); + + return p0->flags; +} + +static uword +ip4_local (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ip4_main_t * im = &ip4_main; +
ip_lookup_main_t * lm = &im->lookup_main; + ip_local_next_t next_index; + u32 * from, * to_next, n_left_from, n_left_to_next; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip4_forward_next_trace (vm, node, frame, VLIB_TX); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t * p0, * p1; + ip4_header_t * ip0, * ip1; + udp_header_t * udp0, * udp1; + ip4_fib_mtrie_t * mtrie0, * mtrie1; + ip4_fib_mtrie_leaf_t leaf0, leaf1; + ip_adjacency_t * adj0, * adj1; + u32 pi0, ip_len0, udp_len0, flags0, next0, fib_index0, adj_index0; + u32 pi1, ip_len1, udp_len1, flags1, next1, fib_index1, adj_index1; + i32 len_diff0, len_diff1; + u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0; + u8 error1, is_udp1, is_tcp_udp1, good_tcp_udp1, proto1; + u8 enqueue_code; + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer(p0)->sw_if_index[VLIB_RX]); + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer(p1)->sw_if_index[VLIB_RX]); + + mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie; + mtrie1 = &vec_elt_at_index (im->fibs, fib_index1)->mtrie; + + leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0); + + proto0 = ip0->protocol; + proto1 = ip1->protocol; + is_udp0 = proto0 == IP_PROTOCOL_UDP; + is_udp1 = proto1 == 
IP_PROTOCOL_UDP; + is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP; + is_tcp_udp1 = is_udp1 || proto1 == IP_PROTOCOL_TCP; + + flags0 = p0->flags; + flags1 = p1->flags; + + good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_tcp_udp1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + + udp0 = ip4_next_header (ip0); + udp1 = ip4_next_header (ip1); + + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + good_tcp_udp0 |= is_udp0 && udp0->checksum == 0; + good_tcp_udp1 |= is_udp1 && udp1->checksum == 0; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1); + + /* Verify UDP length. */ + ip_len0 = clib_net_to_host_u16 (ip0->length); + ip_len1 = clib_net_to_host_u16 (ip1->length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + udp_len1 = clib_net_to_host_u16 (udp1->length); + + len_diff0 = ip_len0 - udp_len0; + len_diff1 = ip_len1 - udp_len1; + + len_diff0 = is_udp0 ? len_diff0 : 0; + len_diff1 = is_udp1 ? len_diff1 : 0; + + if (PREDICT_FALSE (! (is_tcp_udp0 & is_tcp_udp1 + & good_tcp_udp0 & good_tcp_udp1))) + { + if (is_tcp_udp0) + { + if (is_tcp_udp0 + && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED)) + flags0 = ip4_tcp_udp_validate_checksum (vm, p0); + good_tcp_udp0 = + (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_tcp_udp0 |= is_udp0 && udp0->checksum == 0; + } + if (is_tcp_udp1) + { + if (is_tcp_udp1 + && ! 
(flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED)) + flags1 = ip4_tcp_udp_validate_checksum (vm, p1); + good_tcp_udp1 = + (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_tcp_udp1 |= is_udp1 && udp1->checksum == 0; + } + } + + good_tcp_udp0 &= len_diff0 >= 0; + good_tcp_udp1 &= len_diff1 >= 0; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2); + + error0 = error1 = IP4_ERROR_UNKNOWN_PROTOCOL; + + error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0; + error1 = len_diff1 < 0 ? IP4_ERROR_UDP_LENGTH : error1; + + ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM); + error0 = (is_tcp_udp0 && ! good_tcp_udp0 + ? IP4_ERROR_TCP_CHECKSUM + is_udp0 + : error0); + error1 = (is_tcp_udp1 && ! good_tcp_udp1 + ? IP4_ERROR_TCP_CHECKSUM + is_udp1 + : error1); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3); + + vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0; + + vnet_buffer (p1)->ip.adj_index[VLIB_RX] = adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1); + vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1; + + ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0, + &ip0->src_address, + /* no_default_route */ 1)); + ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, fib_index1, + &ip1->src_address, + /* no_default_route */ 1)); + + adj0 = ip_get_adjacency (lm, adj_index0); + adj1 = ip_get_adjacency (lm, adj_index1); + + /* + * Must have a route to source otherwise we drop the packet. + * ip4 broadcasts are accepted, e.g. 
to make dhcp client work + */ + error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL + && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE + && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP + && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL + && ip0->dst_address.as_u32 != 0xFFFFFFFF + ? IP4_ERROR_SRC_LOOKUP_MISS + : error0); + error1 = (error1 == IP4_ERROR_UNKNOWN_PROTOCOL + && adj1->lookup_next_index != IP_LOOKUP_NEXT_REWRITE + && adj1->lookup_next_index != IP_LOOKUP_NEXT_ARP + && adj1->lookup_next_index != IP_LOOKUP_NEXT_LOCAL + && ip0->dst_address.as_u32 != 0xFFFFFFFF + ? IP4_ERROR_SRC_LOOKUP_MISS + : error1); + + next0 = lm->local_next_by_ip_protocol[proto0]; + next1 = lm->local_next_by_ip_protocol[proto1]; + + next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0; + next1 = error1 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next1; + + p0->error = error0 ? error_node->errors[error0] : 0; + p1->error = error1 ? error_node->errors[error1] : 0; + + enqueue_code = (next0 != next_index) + 2*(next1 != next_index); + + if (PREDICT_FALSE (enqueue_code != 0)) + { + switch (enqueue_code) + { + case 1: + /* A B A */ + to_next[-2] = pi1; + to_next -= 1; + n_left_to_next += 1; + vlib_set_next_frame_buffer (vm, node, next0, pi0); + break; + + case 2: + /* A A B */ + to_next -= 1; + n_left_to_next += 1; + vlib_set_next_frame_buffer (vm, node, next1, pi1); + break; + + case 3: + /* A B B or A B C */ + to_next -= 2; + n_left_to_next += 2; + vlib_set_next_frame_buffer (vm, node, next0, pi0); + vlib_set_next_frame_buffer (vm, node, next1, pi1); + if (next0 == next1) + { + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + next_index = next1; + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + } + break; + } + } + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + udp_header_t * udp0; + ip4_fib_mtrie_t * mtrie0; + ip4_fib_mtrie_leaf_t leaf0; + ip_adjacency_t * adj0; + u32 
pi0, next0, ip_len0, udp_len0, flags0, fib_index0, adj_index0; + i32 len_diff0; + u8 error0, is_udp0, is_tcp_udp0, good_tcp_udp0, proto0; + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + + ip0 = vlib_buffer_get_current (p0); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, + vnet_buffer(p0)->sw_if_index[VLIB_RX]); + + mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie; + + leaf0 = IP4_FIB_MTRIE_LEAF_ROOT; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0); + + proto0 = ip0->protocol; + is_udp0 = proto0 == IP_PROTOCOL_UDP; + is_tcp_udp0 = is_udp0 || proto0 == IP_PROTOCOL_TCP; + + flags0 = p0->flags; + + good_tcp_udp0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + + udp0 = ip4_next_header (ip0); + + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + good_tcp_udp0 |= is_udp0 && udp0->checksum == 0; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1); + + /* Verify UDP length. */ + ip_len0 = clib_net_to_host_u16 (ip0->length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + + len_diff0 = ip_len0 - udp_len0; + + len_diff0 = is_udp0 ? len_diff0 : 0; + + if (PREDICT_FALSE (! (is_tcp_udp0 & good_tcp_udp0))) + { + if (is_tcp_udp0) + { + if (is_tcp_udp0 + && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED)) + flags0 = ip4_tcp_udp_validate_checksum (vm, p0); + good_tcp_udp0 = + (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_tcp_udp0 |= is_udp0 && udp0->checksum == 0; + } + } + + good_tcp_udp0 &= len_diff0 >= 0; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2); + + error0 = IP4_ERROR_UNKNOWN_PROTOCOL; + + error0 = len_diff0 < 0 ? IP4_ERROR_UDP_LENGTH : error0; + + ASSERT (IP4_ERROR_TCP_CHECKSUM + 1 == IP4_ERROR_UDP_CHECKSUM); + error0 = (is_tcp_udp0 && ! good_tcp_udp0 + ? 
IP4_ERROR_TCP_CHECKSUM + is_udp0 + : error0); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3); + + vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0; + + ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, fib_index0, + &ip0->src_address, + /* no_default_route */ 1)); + + adj0 = ip_get_adjacency (lm, adj_index0); + + /* Must have a route to source otherwise we drop the packet. */ + error0 = (error0 == IP4_ERROR_UNKNOWN_PROTOCOL + && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE + && adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP + && adj0->lookup_next_index != IP_LOOKUP_NEXT_LOCAL + && ip0->dst_address.as_u32 != 0xFFFFFFFF + ? IP4_ERROR_SRC_LOOKUP_MISS + : error0); + + next0 = lm->local_next_by_ip_protocol[proto0]; + + next0 = error0 != IP4_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0; + + p0->error = error0? error_node->errors[error0] : 0; + + if (PREDICT_FALSE (next0 != next_index)) + { + n_left_to_next += 1; + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + + next_index = next0; + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + to_next[0] = pi0; + to_next += 1; + n_left_to_next -= 1; + } + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip4_local_node,static) = { + .function = ip4_local, + .name = "ip4-local", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + + .n_next_nodes = IP_LOCAL_N_NEXT, + .next_nodes = { + [IP_LOCAL_NEXT_DROP] = "error-drop", + [IP_LOCAL_NEXT_PUNT] = "error-punt", + // [IP_LOCAL_NEXT_TCP_LOOKUP] = "ip4-tcp-lookup", + [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip4-udp-lookup", + [IP_LOCAL_NEXT_ICMP] = "ip4-icmp-input", + }, +}; + +void ip4_register_protocol (u32 protocol, u32 node_index) +{ + vlib_main_t * vm = vlib_get_main(); + ip4_main_t * im = &ip4_main; + 
ip_lookup_main_t * lm = &im->lookup_main; + + ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol)); + lm->local_next_by_ip_protocol[protocol] = vlib_node_add_next (vm, ip4_local_node.index, node_index); +} + +static clib_error_t * +show_ip_local_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + int i; + + vlib_cli_output (vm, "Protocols handled by ip4_local"); + for (i = 0; i < ARRAY_LEN(lm->local_next_by_ip_protocol); i++) + { + if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT) + vlib_cli_output (vm, "%d", i); + } + return 0; +} + + + +VLIB_CLI_COMMAND (show_ip_local, static) = { + .path = "show ip local", + .function = show_ip_local_command_fn, + .short_help = "Show ip local protocol table", +}; + +static uword +ip4_arp (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t * vnm = vnet_get_main(); + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + u32 * from, * to_next_drop; + uword n_left_from, n_left_to_next_drop, next_index; + static f64 time_last_seed_change = -1e100; + static u32 hash_seeds[3]; + static uword hash_bitmap[256 / BITS (uword)]; + f64 time_now; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip4_forward_next_trace (vm, node, frame, VLIB_TX); + + time_now = vlib_time_now (vm); + if (time_now - time_last_seed_change > 1e-3) + { + uword i; + u32 * r = clib_random_buffer_get_data (&vm->random_buffer, + sizeof (hash_seeds)); + for (i = 0; i < ARRAY_LEN (hash_seeds); i++) + hash_seeds[i] = r[i]; + + /* Mark all hash keys as been no-seen before. 
*/ + for (i = 0; i < ARRAY_LEN (hash_bitmap); i++) + hash_bitmap[i] = 0; + + time_last_seed_change = time_now; + } + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + if (next_index == IP4_ARP_NEXT_DROP) + next_index = IP4_ARP_N_NEXT; /* point to first interface */ + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, IP4_ARP_NEXT_DROP, + to_next_drop, n_left_to_next_drop); + + while (n_left_from > 0 && n_left_to_next_drop > 0) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + ethernet_header_t * eh0; + u32 pi0, adj_index0, a0, b0, c0, m0, sw_if_index0, drop0; + uword bm0; + ip_adjacency_t * adj0; + + pi0 = from[0]; + + p0 = vlib_get_buffer (vm, pi0); + + adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + adj0 = ip_get_adjacency (lm, adj_index0); + ip0 = vlib_buffer_get_current (p0); + + /* + * if ip4_rewrite_local applied the IP_LOOKUP_NEXT_ARP + * rewrite to this packet, we need to skip it here. + * Note, to distinguish from src IP addr *.8.6.*, we + * check for a bcast eth dest instead of IPv4 version. 
+ */ + eh0 = (ethernet_header_t*)ip0; + if ((ip0->ip_version_and_header_length & 0xF0) != 0x40) + { + u32 vlan_num = 0; + u16 * etype = &eh0->type; + while ((*etype == clib_host_to_net_u16 (0x8100)) //dot1q + || (*etype == clib_host_to_net_u16 (0x88a8)))//dot1ad + { + vlan_num += 1; + etype += 2; //vlan tag also 16 bits, same as etype + } + if (*etype == clib_host_to_net_u16 (0x0806)) //arp + { + vlib_buffer_advance ( + p0, sizeof(ethernet_header_t) + (4*vlan_num)); + ip0 = vlib_buffer_get_current (p0); + } + } + + a0 = hash_seeds[0]; + b0 = hash_seeds[1]; + c0 = hash_seeds[2]; + + sw_if_index0 = adj0->rewrite_header.sw_if_index; + vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0; + + a0 ^= ip0->dst_address.data_u32; + b0 ^= sw_if_index0; + + hash_v3_finalize32 (a0, b0, c0); + + c0 &= BITS (hash_bitmap) - 1; + c0 = c0 / BITS (uword); + m0 = (uword) 1 << (c0 % BITS (uword)); + + bm0 = hash_bitmap[c0]; + drop0 = (bm0 & m0) != 0; + + /* Mark it as seen. */ + hash_bitmap[c0] = bm0 | m0; + + from += 1; + n_left_from -= 1; + to_next_drop[0] = pi0; + to_next_drop += 1; + n_left_to_next_drop -= 1; + + p0->error = node->errors[drop0 ? IP4_ARP_ERROR_DROP : IP4_ARP_ERROR_REQUEST_SENT]; + + if (drop0) + continue; + + /* + * Can happen if the control-plane is programming tables + * with traffic flowing; at least that's today's lame excuse. + */ + if (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP) + { + p0->error = node->errors[IP4_ARP_ERROR_NON_ARP_ADJ]; + } + else + /* Send ARP request. */ + { + u32 bi0 = 0; + vlib_buffer_t * b0; + ethernet_arp_header_t * h0; + vnet_hw_interface_t * hw_if0; + + h0 = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi0); + + /* Add rewrite/encap string for ARP packet. */ + vnet_rewrite_one_header (adj0[0], h0, sizeof (ethernet_header_t)); + + hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0); + + /* Src ethernet address in ARP header. 
*/ + memcpy (h0->ip4_over_ethernet[0].ethernet, hw_if0->hw_address, + sizeof (h0->ip4_over_ethernet[0].ethernet)); + + ip4_src_address_for_packet (im, p0, &h0->ip4_over_ethernet[0].ip4, sw_if_index0); + + /* Copy in destination address we are requesting. */ + h0->ip4_over_ethernet[1].ip4.data_u32 = ip0->dst_address.data_u32; + + vlib_buffer_copy_trace_flag (vm, p0, bi0); + b0 = vlib_get_buffer (vm, bi0); + vnet_buffer (b0)->sw_if_index[VLIB_TX] = sw_if_index0; + + vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes); + + vlib_set_next_frame_buffer (vm, node, adj0->rewrite_header.next_index, bi0); + } + } + + vlib_put_next_frame (vm, node, IP4_ARP_NEXT_DROP, n_left_to_next_drop); + } + + return frame->n_vectors; +} + +static char * ip4_arp_error_strings[] = { + [IP4_ARP_ERROR_DROP] = "address overflow drops", + [IP4_ARP_ERROR_REQUEST_SENT] = "ARP requests sent", + [IP4_ARP_ERROR_NON_ARP_ADJ] = "ARPs to non-ARP adjacencies", + [IP4_ARP_ERROR_REPLICATE_DROP] = "ARP replication completed", + [IP4_ARP_ERROR_REPLICATE_FAIL] = "ARP replication failed", +}; + +VLIB_REGISTER_NODE (ip4_arp_node) = { + .function = ip4_arp, + .name = "ip4-arp", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + + .n_errors = ARRAY_LEN (ip4_arp_error_strings), + .error_strings = ip4_arp_error_strings, + + .n_next_nodes = IP4_ARP_N_NEXT, + .next_nodes = { + [IP4_ARP_NEXT_DROP] = "error-drop", + }, +}; + +#define foreach_notrace_ip4_arp_error \ +_(DROP) \ +_(REQUEST_SENT) \ +_(REPLICATE_DROP) \ +_(REPLICATE_FAIL) + +clib_error_t * arp_notrace_init (vlib_main_t * vm) +{ + vlib_node_runtime_t *rt = + vlib_node_get_runtime (vm, ip4_arp_node.index); + + /* don't trace ARP request packets */ +#define _(a) \ + vnet_pcap_drop_trace_filter_add_del \ + (rt->errors[IP4_ARP_ERROR_##a], \ + 1 /* is_add */); + foreach_notrace_ip4_arp_error; +#undef _ + return 0; +} + +VLIB_INIT_FUNCTION(arp_notrace_init); + + +/* Send an ARP request to see if given destination is 
reachable on given interface. */ +clib_error_t * +ip4_probe_neighbor (vlib_main_t * vm, ip4_address_t * dst, u32 sw_if_index) +{ + vnet_main_t * vnm = vnet_get_main(); + ip4_main_t * im = &ip4_main; + ethernet_arp_header_t * h; + ip4_address_t * src; + ip_interface_address_t * ia; + ip_adjacency_t * adj; + vnet_hw_interface_t * hi; + vnet_sw_interface_t * si; + vlib_buffer_t * b; + u32 bi = 0; + + si = vnet_get_sw_interface (vnm, sw_if_index); + + if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)) + { + return clib_error_return (0, "%U: interface %U down", + format_ip4_address, dst, + format_vnet_sw_if_index_name, vnm, + sw_if_index); + } + + src = ip4_interface_address_matching_destination (im, dst, sw_if_index, &ia); + if (! src) + { + vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE; + return clib_error_return + (0, "no matching interface address for destination %U (interface %U)", + format_ip4_address, dst, + format_vnet_sw_if_index_name, vnm, sw_if_index); + } + + adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index); + + h = vlib_packet_template_get_packet (vm, &im->ip4_arp_request_packet_template, &bi); + + hi = vnet_get_sup_hw_interface (vnm, sw_if_index); + + memcpy (h->ip4_over_ethernet[0].ethernet, hi->hw_address, sizeof (h->ip4_over_ethernet[0].ethernet)); + + h->ip4_over_ethernet[0].ip4 = src[0]; + h->ip4_over_ethernet[1].ip4 = dst[0]; + + b = vlib_get_buffer (vm, bi); + vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index; + + /* Add encapsulation string for software interface (e.g. ethernet header). 
*/ + vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t)); + vlib_buffer_advance (b, -adj->rewrite_header.data_bytes); + + { + vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index); + u32 * to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, hi->output_node_index, f); + } + + return /* no error */ 0; +} + +typedef enum { + IP4_REWRITE_NEXT_DROP, + IP4_REWRITE_NEXT_ARP, +} ip4_rewrite_next_t; + +always_inline uword +ip4_rewrite_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + int rewrite_for_locally_received_packets) +{ + ip_lookup_main_t * lm = &ip4_main.lookup_main; + u32 * from = vlib_frame_vector_args (frame); + u32 n_left_from, n_left_to_next, * to_next, next_index; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index); + vlib_rx_or_tx_t adj_rx_tx = rewrite_for_locally_received_packets ? VLIB_RX : VLIB_TX; + + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + u32 cpu_index = os_get_cpu_number(); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + ip_adjacency_t * adj0, * adj1; + vlib_buffer_t * p0, * p1; + ip4_header_t * ip0, * ip1; + u32 pi0, rw_len0, next0, error0, checksum0, adj_index0; + u32 pi1, rw_len1, next1, error1, checksum1, adj_index1; + u32 next0_override, next1_override; + + if (rewrite_for_locally_received_packets) + next0_override = next1_override = 0; + + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, STORE); + vlib_prefetch_buffer_header (p3, STORE); + + CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE); + CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx]; + adj_index1 = vnet_buffer (p1)->ip.adj_index[adj_rx_tx]; + + /* We should never rewrite a pkt using the MISS adjacency */ + ASSERT(adj_index0 && adj_index1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + error0 = error1 = IP4_ERROR_NONE; + + /* Decrement TTL & update checksum. + Works either endian, so no need for byte swap. */ + if (! rewrite_for_locally_received_packets) + { + i32 ttl0 = ip0->ttl, ttl1 = ip1->ttl; + + /* Input node should have reject packets with ttl 0. */ + ASSERT (ip0->ttl > 0); + ASSERT (ip1->ttl > 0); + + checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100); + checksum1 = ip1->checksum + clib_host_to_net_u16 (0x0100); + + checksum0 += checksum0 >= 0xffff; + checksum1 += checksum1 >= 0xffff; + + ip0->checksum = checksum0; + ip1->checksum = checksum1; + + ttl0 -= 1; + ttl1 -= 1; + + ip0->ttl = ttl0; + ip1->ttl = ttl1; + + error0 = ttl0 <= 0 ? IP4_ERROR_TIME_EXPIRED : error0; + error1 = ttl1 <= 0 ? IP4_ERROR_TIME_EXPIRED : error1; + + /* Verify checksum. */ + ASSERT (ip0->checksum == ip4_header_checksum (ip0)); + ASSERT (ip1->checksum == ip4_header_checksum (ip1)); + } + + /* Rewrite packet header and updates lengths. */ + adj0 = ip_get_adjacency (lm, adj_index0); + adj1 = ip_get_adjacency (lm, adj_index1); + + if (rewrite_for_locally_received_packets) + { + /* + * If someone sends e.g. 
an icmp4 w/ src = dst = interface addr, + * we end up here with a local adjacency in hand + * The local adj rewrite data is 0xfefe on purpose. + * Bad engineer, no donut for you. + */ + if (PREDICT_FALSE(adj0->lookup_next_index + == IP_LOOKUP_NEXT_LOCAL)) + error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS; + if (PREDICT_FALSE(adj0->lookup_next_index + == IP_LOOKUP_NEXT_ARP)) + next0_override = IP4_REWRITE_NEXT_ARP; + if (PREDICT_FALSE(adj1->lookup_next_index + == IP_LOOKUP_NEXT_LOCAL)) + error1 = IP4_ERROR_SPOOFED_LOCAL_PACKETS; + if (PREDICT_FALSE(adj1->lookup_next_index + == IP_LOOKUP_NEXT_ARP)) + next1_override = IP4_REWRITE_NEXT_ARP; + } + + /* Worth pipelining. No guarantee that adj0,1 are hot... */ + rw_len0 = adj0[0].rewrite_header.data_bytes; + rw_len1 = adj1[0].rewrite_header.data_bytes; + next0 = (error0 == IP4_ERROR_NONE) + ? adj0[0].rewrite_header.next_index : 0; + + if (rewrite_for_locally_received_packets) + next0 = next0 && next0_override ? next0_override : next0; + + next1 = (error1 == IP4_ERROR_NONE) + ? adj1[0].rewrite_header.next_index : 0; + + if (rewrite_for_locally_received_packets) + next1 = next1 && next1_override ? next1_override : next1; + + /* + * We've already accounted for an ethernet_header_t elsewhere + */ + if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t))) + vlib_increment_combined_counter + (&lm->adjacency_counters, + cpu_index, adj_index0, + /* packet increment */ 0, + /* byte increment */ rw_len0-sizeof(ethernet_header_t)); + + if (PREDICT_FALSE (rw_len1 > sizeof(ethernet_header_t))) + vlib_increment_combined_counter + (&lm->adjacency_counters, + cpu_index, adj_index1, + /* packet increment */ 0, + /* byte increment */ rw_len1-sizeof(ethernet_header_t)); + + /* Check MTU of outgoing interface. */ + error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes + ? IP4_ERROR_MTU_EXCEEDED + : error0); + error1 = (vlib_buffer_length_in_chain (vm, p1) > adj1[0].rewrite_header.max_l3_packet_bytes + ? 
IP4_ERROR_MTU_EXCEEDED + : error1); + + p0->current_data -= rw_len0; + p1->current_data -= rw_len1; + + p0->current_length += rw_len0; + p1->current_length += rw_len1; + + vnet_buffer (p0)->sw_if_index[VLIB_TX] = adj0[0].rewrite_header.sw_if_index; + vnet_buffer (p1)->sw_if_index[VLIB_TX] = adj1[0].rewrite_header.sw_if_index; + + p0->error = error_node->errors[error0]; + p1->error = error_node->errors[error1]; + + /* Guess we are only writing on simple Ethernet header. */ + vnet_rewrite_two_headers (adj0[0], adj1[0], + ip0, ip1, + sizeof (ethernet_header_t)); + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + ip_adjacency_t * adj0; + vlib_buffer_t * p0; + ip4_header_t * ip0; + u32 pi0, rw_len0, adj_index0, next0, error0, checksum0; + u32 next0_override; + + if (rewrite_for_locally_received_packets) + next0_override = 0; + + pi0 = to_next[0] = from[0]; + + p0 = vlib_get_buffer (vm, pi0); + + adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx]; + + /* We should never rewrite a pkt using the MISS adjacency */ + ASSERT(adj_index0); + + adj0 = ip_get_adjacency (lm, adj_index0); + + ip0 = vlib_buffer_get_current (p0); + + error0 = IP4_ERROR_NONE; + next0 = 0; /* drop on error */ + + /* Decrement TTL & update checksum. */ + if (! rewrite_for_locally_received_packets) + { + i32 ttl0 = ip0->ttl; + + checksum0 = ip0->checksum + clib_host_to_net_u16 (0x0100); + + checksum0 += checksum0 >= 0xffff; + + ip0->checksum = checksum0; + + ASSERT (ip0->ttl > 0); + + ttl0 -= 1; + + ip0->ttl = ttl0; + + ASSERT (ip0->checksum == ip4_header_checksum (ip0)); + + error0 = ttl0 <= 0 ? IP4_ERROR_TIME_EXPIRED : error0; + } + + if (rewrite_for_locally_received_packets) + { + /* + * If someone sends e.g. an icmp4 w/ src = dst = interface addr, + * we end up here with a local adjacency in hand + * The local adj rewrite data is 0xfefe on purpose. 
+ * Bad engineer, no donut for you. + */ + if (PREDICT_FALSE(adj0->lookup_next_index + == IP_LOOKUP_NEXT_LOCAL)) + error0 = IP4_ERROR_SPOOFED_LOCAL_PACKETS; + /* + * We have to override the next_index in ARP adjacencies, + * because they're set up for ip4-arp, not this node... + */ + if (PREDICT_FALSE(adj0->lookup_next_index + == IP_LOOKUP_NEXT_ARP)) + next0_override = IP4_REWRITE_NEXT_ARP; + } + + /* Guess we are only writing on simple Ethernet header. */ + vnet_rewrite_one_header (adj0[0], ip0, + sizeof (ethernet_header_t)); + + /* Update packet buffer attributes/set output interface. */ + rw_len0 = adj0[0].rewrite_header.data_bytes; + + if (PREDICT_FALSE (rw_len0 > sizeof(ethernet_header_t))) + vlib_increment_combined_counter + (&lm->adjacency_counters, + cpu_index, adj_index0, + /* packet increment */ 0, + /* byte increment */ rw_len0-sizeof(ethernet_header_t)); + + /* Check MTU of outgoing interface. */ + error0 = (vlib_buffer_length_in_chain (vm, p0) + > adj0[0].rewrite_header.max_l3_packet_bytes + ? IP4_ERROR_MTU_EXCEEDED + : error0); + + p0->error = error_node->errors[error0]; + p0->current_data -= rw_len0; + p0->current_length += rw_len0; + vnet_buffer (p0)->sw_if_index[VLIB_TX] = + adj0[0].rewrite_header.sw_if_index; + + next0 = (error0 == IP4_ERROR_NONE) + ? adj0[0].rewrite_header.next_index : 0; + + if (rewrite_for_locally_received_packets) + next0 = next0 && next0_override ? next0_override : next0; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Need to do trace after rewrites to pick up new packet data. 
*/ + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip4_forward_next_trace (vm, node, frame, adj_rx_tx); + + return frame->n_vectors; +} + +static uword +ip4_rewrite_transit (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip4_rewrite_inline (vm, node, frame, + /* rewrite_for_locally_received_packets */ 0); +} + +static uword +ip4_rewrite_local (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip4_rewrite_inline (vm, node, frame, + /* rewrite_for_locally_received_packets */ 1); +} + +VLIB_REGISTER_NODE (ip4_rewrite_node) = { + .function = ip4_rewrite_transit, + .name = "ip4-rewrite-transit", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + + .n_next_nodes = 2, + .next_nodes = { + [IP4_REWRITE_NEXT_DROP] = "error-drop", + [IP4_REWRITE_NEXT_ARP] = "ip4-arp", + }, +}; + +VLIB_REGISTER_NODE (ip4_rewrite_local_node,static) = { + .function = ip4_rewrite_local, + .name = "ip4-rewrite-local", + .vector_size = sizeof (u32), + + .sibling_of = "ip4-rewrite-transit", + + .format_trace = format_ip4_forward_next_trace, + + .n_next_nodes = 2, + .next_nodes = { + [IP4_REWRITE_NEXT_DROP] = "error-drop", + [IP4_REWRITE_NEXT_ARP] = "ip4-arp", + }, +}; + +static clib_error_t * +add_del_interface_table (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 sw_if_index, table_id; + + sw_if_index = ~0; + + if (! 
unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+    {
+      error = clib_error_return (0, "unknown interface `%U'",
+                                 format_unformat_error, input);
+      goto done;
+    }
+
+  if (unformat (input, "%d", &table_id))
+    ;
+  else
+    {
+      error = clib_error_return (0, "expected table id `%U'",
+                                 format_unformat_error, input);
+      goto done;
+    }
+
+  {
+    ip4_main_t * im = &ip4_main;
+    ip4_fib_t * fib = find_ip4_fib_by_table_index_or_id (im, table_id, IP4_ROUTE_FLAG_TABLE_ID);
+
+    if (fib)
+      {
+        vec_validate (im->fib_index_by_sw_if_index, sw_if_index);
+        im->fib_index_by_sw_if_index[sw_if_index] = fib->index;
+      }
+  }
+
+ done:
+  return error;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = {
+  .path = "set interface ip table",
+  .function = add_del_interface_table,
+  .short_help = "Add/delete FIB table id for interface",
+};
+
+/*
+ * Graph node function behind "ip4-lookup-multicast".
+ *
+ * For every buffer index in FRAME:
+ *   - pick a FIB index: the entry of im->fib_index_by_sw_if_index for the
+ *     buffer's RX sw_if_index, unless sw_if_index[VLIB_TX] is not ~0, in
+ *     which case that value is used as the FIB index;
+ *   - look up ip0->dst_address with ip4_fib_lookup_buffer;
+ *   - compute the flow hash with that FIB's flow_hash_config and use it to
+ *     select one of the adjacency's n_adj entries (n_adj is asserted to be
+ *     a non-zero power of two);
+ *   - store the result in ip.adj_index[VLIB_TX], bump the adjacency
+ *     counters and enqueue the buffer to adj->lookup_next_index.
+ *
+ * Returns frame->n_vectors.
+ */
+static uword
+ip4_lookup_multicast (vlib_main_t * vm,
+                      vlib_node_runtime_t * node,
+                      vlib_frame_t * frame)
+{
+  ip4_main_t * im = &ip4_main;
+  ip_lookup_main_t * lm = &im->lookup_main;
+  vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters;
+  u32 n_left_from, n_left_to_next, * from, * to_next;
+  ip_lookup_next_t next;
+  u32 cpu_index = os_get_cpu_number();
+
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next = node->cached_next_index;
+
+  while (n_left_from > 0)
+    {
+      vlib_get_next_frame (vm, node, next,
+                           to_next, n_left_to_next);
+
+      /* Dual loop: two buffers per iteration.  It needs four buffers left
+         because from[2] and from[3] are prefetched below. */
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+        {
+          vlib_buffer_t * p0, * p1;
+          u32 pi0, pi1, adj_index0, adj_index1, wrong_next;
+          ip_lookup_next_t next0, next1;
+          ip4_header_t * ip0, * ip1;
+          ip_adjacency_t * adj0, * adj1;
+          u32 fib_index0, fib_index1;
+          u32 flow_hash_config0, flow_hash_config1;
+
+          /* Prefetch next iteration. */
+          {
+            vlib_buffer_t * p2, * p3;
+
+            p2 = vlib_get_buffer (vm, from[2]);
+            p3 = vlib_get_buffer (vm, from[3]);
+
+            vlib_prefetch_buffer_header (p2, LOAD);
+            vlib_prefetch_buffer_header (p3, LOAD);
+
+            CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+            CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD);
+          }
+
+          /* Speculatively place both buffers in the current next frame. */
+          pi0 = to_next[0] = from[0];
+          pi1 = to_next[1] = from[1];
+
+          p0 = vlib_get_buffer (vm, pi0);
+          p1 = vlib_get_buffer (vm, pi1);
+
+          ip0 = vlib_buffer_get_current (p0);
+          ip1 = vlib_buffer_get_current (p1);
+
+          fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+          fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]);
+          fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+            fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+          fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ?
+            fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX];
+
+          adj_index0 = ip4_fib_lookup_buffer (im, fib_index0,
+                                              &ip0->dst_address, p0);
+          adj_index1 = ip4_fib_lookup_buffer (im, fib_index1,
+                                              &ip1->dst_address, p1);
+
+          adj0 = ip_get_adjacency (lm, adj_index0);
+          adj1 = ip_get_adjacency (lm, adj_index1);
+
+          next0 = adj0->lookup_next_index;
+          next1 = adj1->lookup_next_index;
+
+          flow_hash_config0 =
+            vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+
+          flow_hash_config1 =
+            vec_elt_at_index (im->fibs, fib_index1)->flow_hash_config;
+
+          vnet_buffer (p0)->ip.flow_hash = ip4_compute_flow_hash
+            (ip0, flow_hash_config0);
+
+          vnet_buffer (p1)->ip.flow_hash = ip4_compute_flow_hash
+            (ip1, flow_hash_config1);
+
+          /* The masking below only works for a power-of-two n_adj. */
+          ASSERT (adj0->n_adj > 0);
+          ASSERT (adj1->n_adj > 0);
+          ASSERT (is_pow2 (adj0->n_adj));
+          ASSERT (is_pow2 (adj1->n_adj));
+          adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+          adj_index1 += (vnet_buffer (p1)->ip.flow_hash & (adj1->n_adj - 1));
+
+          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+          vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1;
+
+          if (1) /* $$$$$$ HACK FIXME */
+            vlib_increment_combined_counter
+              (cm, cpu_index, adj_index0, 1,
+               vlib_buffer_length_in_chain (vm, p0));
+          if (1) /* $$$$$$ HACK FIXME */
+            vlib_increment_combined_counter
+              (cm, cpu_index, adj_index1, 1,
+               vlib_buffer_length_in_chain (vm, p1));
+
+          from += 2;
+          to_next += 2;
+          n_left_to_next -= 2;
+          n_left_from -= 2;
+
+          /* Bit 0 is set when p0 needs a different next node than the
+             current frame's, bit 1 when p1 does. */
+          wrong_next = (next0 != next) + 2*(next1 != next);
+          if (PREDICT_FALSE (wrong_next != 0))
+            {
+              switch (wrong_next)
+                {
+                case 1:
+                  /* A B A */
+                  to_next[-2] = pi1;
+                  to_next -= 1;
+                  n_left_to_next += 1;
+                  vlib_set_next_frame_buffer (vm, node, next0, pi0);
+                  break;
+
+                case 2:
+                  /* A A B */
+                  to_next -= 1;
+                  n_left_to_next += 1;
+                  vlib_set_next_frame_buffer (vm, node, next1, pi1);
+                  break;
+
+                case 3:
+                  /* A B C */
+                  to_next -= 2;
+                  n_left_to_next += 2;
+                  vlib_set_next_frame_buffer (vm, node, next0, pi0);
+                  vlib_set_next_frame_buffer (vm, node, next1, pi1);
+                  if (next0 == next1)
+                    {
+                      /* A B B */
+                      vlib_put_next_frame (vm, node, next, n_left_to_next);
+                      next = next1;
+                      vlib_get_next_frame (vm, node, next, to_next, n_left_to_next);
+                    }
+                }
+            }
+        }
+
+      /* Single loop: same processing, one buffer at a time. */
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+          vlib_buffer_t * p0;
+          ip4_header_t * ip0;
+          u32 pi0, adj_index0;
+          ip_lookup_next_t next0;
+          ip_adjacency_t * adj0;
+          u32 fib_index0;
+          u32 flow_hash_config0;
+
+          pi0 = from[0];
+          to_next[0] = pi0;
+
+          p0 = vlib_get_buffer (vm, pi0);
+
+          ip0 = vlib_buffer_get_current (p0);
+
+          fib_index0 = vec_elt (im->fib_index_by_sw_if_index,
+                                vnet_buffer (p0)->sw_if_index[VLIB_RX]);
+          fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ?
+            fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX];
+
+          adj_index0 = ip4_fib_lookup_buffer (im, fib_index0,
+                                              &ip0->dst_address, p0);
+
+          adj0 = ip_get_adjacency (lm, adj_index0);
+
+          next0 = adj0->lookup_next_index;
+
+          flow_hash_config0 =
+            vec_elt_at_index (im->fibs, fib_index0)->flow_hash_config;
+
+          vnet_buffer (p0)->ip.flow_hash =
+            ip4_compute_flow_hash (ip0, flow_hash_config0);
+
+          ASSERT (adj0->n_adj > 0);
+          ASSERT (is_pow2 (adj0->n_adj));
+          adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1));
+
+          vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0;
+
+          if (1) /* $$$$$$ HACK FIXME */
+            vlib_increment_combined_counter
+              (cm, cpu_index, adj_index0, 1,
+               vlib_buffer_length_in_chain (vm, p0));
+
+          from += 1;
+          to_next += 1;
+          n_left_to_next -= 1;
+          n_left_from -= 1;
+
+          if (PREDICT_FALSE (next0 != next))
+            {
+              /* Take the buffer back out of the current frame, switch to
+                 the frame for next0 and enqueue it there. */
+              n_left_to_next += 1;
+              vlib_put_next_frame (vm, node, next, n_left_to_next);
+              next = next0;
+              vlib_get_next_frame (vm, node, next,
+                                   to_next, n_left_to_next);
+              to_next[0] = pi0;
+              to_next += 1;
+              n_left_to_next -= 1;
+            }
+        }
+
+      vlib_put_next_frame (vm, node, next, n_left_to_next);
+    }
+
+  return frame->n_vectors;
+}
+
+VLIB_REGISTER_NODE (ip4_lookup_multicast_node,static) = {
+  .function = ip4_lookup_multicast,
+  .name = "ip4-lookup-multicast",
+  .vector_size = sizeof (u32),
+
+  .n_next_nodes = IP_LOOKUP_N_NEXT,
+  .next_nodes = {
+    [IP_LOOKUP_NEXT_MISS] = "ip4-miss",
+    [IP_LOOKUP_NEXT_DROP] = "ip4-drop",
+    [IP_LOOKUP_NEXT_PUNT] = "ip4-punt",
+    [IP_LOOKUP_NEXT_LOCAL] = "ip4-local",
+    [IP_LOOKUP_NEXT_ARP] = "ip4-arp",
+    [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit",
+    [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify",
+    [IP_LOOKUP_NEXT_MAP] = "ip4-map",
+    [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t",
+    [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd",
+    [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop",
+    [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop",
+    [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop",
+  },
+};
+
+VLIB_REGISTER_NODE
(ip4_multicast_node,static) = { + .function = ip4_drop, + .name = "ip4-multicast", + .vector_size = sizeof (u32), + + .format_trace = format_ip4_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0) +{ + ip4_main_t * im = &ip4_main; + ip4_fib_mtrie_t * mtrie0; + ip4_fib_mtrie_leaf_t leaf0; + u32 adj_index0; + + mtrie0 = &vec_elt_at_index (im->fibs, fib_index0)->mtrie; + + leaf0 = IP4_FIB_MTRIE_LEAF_ROOT; + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 0); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 1); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 2); + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, a, 3); + + /* Handle default route. */ + leaf0 = (leaf0 == IP4_FIB_MTRIE_LEAF_EMPTY ? mtrie0->default_leaf : leaf0); + + adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + + return adj_index0 == ip4_fib_lookup_with_table (im, fib_index0, + a, + /* no_default_route */ 0); +} + +static clib_error_t * +test_lookup_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u32 table_id = 0; + f64 count = 1; + u32 n; + int i; + ip4_address_t ip4_base_address; + u64 errors = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "table %d", &table_id)) + ; + else if (unformat (input, "count %f", &count)) + ; + + else if (unformat (input, "%U", + unformat_ip4_address, &ip4_base_address)) + ; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + } + + n = count; + + for (i = 0; i < n; i++) + { + if (!ip4_lookup_validate (&ip4_base_address, table_id)) + errors++; + + ip4_base_address.as_u32 = + clib_host_to_net_u32 (1 + + clib_net_to_host_u32 (ip4_base_address.as_u32)); + } + + if (errors) + vlib_cli_output (vm, "%llu errors out of %d lookups\n", errors, n); + else + vlib_cli_output (vm, "No errors in %d lookups\n", n); + + return 0; 
+} + +VLIB_CLI_COMMAND (lookup_test_command, static) = { + .path = "test lookup", + .short_help = "test lookup", + .function = test_lookup_command_fn, +}; + +int vnet_set_ip4_flow_hash (u32 table_id, u32 flow_hash_config) +{ + ip4_main_t * im4 = &ip4_main; + ip4_fib_t * fib; + uword * p = hash_get (im4->fib_index_by_table_id, table_id); + + if (p == 0) + return VNET_API_ERROR_NO_SUCH_FIB; + + fib = vec_elt_at_index (im4->fibs, p[0]); + + fib->flow_hash_config = flow_hash_config; + return 0; +} + +static clib_error_t * +set_ip_flow_hash_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + int matched = 0; + u32 table_id = 0; + u32 flow_hash_config = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "table %d", &table_id)) + matched = 1; +#define _(a,v) \ + else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;} + foreach_flow_hash_bit +#undef _ + else break; + } + + if (matched == 0) + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + rv = vnet_set_ip4_flow_hash (table_id, flow_hash_config); + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_NO_SUCH_FIB: + return clib_error_return (0, "no such FIB table %d", table_id); + + default: + clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config); + break; + } + + return 0; +} + +VLIB_CLI_COMMAND (set_ip_flow_hash_command, static) = { + .path = "set ip flow-hash", + .short_help = + "set ip table flow-hash table <fib-id> src dst sport dport proto reverse", + .function = set_ip_flow_hash_command_fn, +}; + +int vnet_set_ip4_classify_intfc (vlib_main_t * vm, u32 sw_if_index, + u32 table_index) +{ + vnet_main_t * vnm = vnet_get_main(); + vnet_interface_main_t * im = &vnm->interface_main; + ip4_main_t * ipm = &ip4_main; + ip_lookup_main_t * lm = &ipm->lookup_main; + vnet_classify_main_t * cm = &vnet_classify_main; + + if (pool_is_free_index (im->sw_interfaces, 
sw_if_index)) + return VNET_API_ERROR_NO_MATCHING_INTERFACE; + + if (table_index != ~0 && pool_is_free_index (cm->tables, table_index)) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index); + lm->classify_table_index_by_sw_if_index [sw_if_index] = table_index; + + return 0; +} + +static clib_error_t * +set_ip_classify_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u32 table_index = ~0; + int table_index_set = 0; + u32 sw_if_index = ~0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "table-index %d", &table_index)) + table_index_set = 1; + else if (unformat (input, "intfc %U", unformat_vnet_sw_interface, + vnet_get_main(), &sw_if_index)) + ; + else + break; + } + + if (table_index_set == 0) + return clib_error_return (0, "classify table-index must be specified"); + + if (sw_if_index == ~0) + return clib_error_return (0, "interface / subif must be specified"); + + rv = vnet_set_ip4_classify_intfc (vm, sw_if_index, table_index); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_NO_MATCHING_INTERFACE: + return clib_error_return (0, "No such interface"); + + case VNET_API_ERROR_NO_SUCH_ENTRY: + return clib_error_return (0, "No such classifier table"); + } + return 0; +} + +VLIB_CLI_COMMAND (set_ip_classify_command, static) = { + .path = "set ip classify", + .short_help = + "set ip classify intfc <int> table-index <index>", + .function = set_ip_classify_command_fn, +}; + diff --git a/vnet/vnet/ip/ip4_hop_by_hop.c b/vnet/vnet/ip/ip4_hop_by_hop.c new file mode 100644 index 00000000000..ee2bcc0ae75 --- /dev/null +++ b/vnet/vnet/ip/ip4_hop_by_hop.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> + +#include <vnet/ip/ip.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +typedef struct { + /* convenience */ + vlib_main_t * vlib_main; + vnet_main_t * vnet_main; +} ip4_hop_by_hop_main_t; + +ip4_hop_by_hop_main_t ip4_hop_by_hop_main; + +vlib_node_registration_t ip4_hop_by_hop_node; + +typedef struct { + u32 next_index; +} ip4_hop_by_hop_trace_t; + +/* packet trace format function */ +static u8 * format_ip4_hop_by_hop_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip4_hop_by_hop_trace_t * t = va_arg (*args, ip4_hop_by_hop_trace_t *); + + s = format (s, "IP4_HOP_BY_HOP: next index %d", + t->next_index); + return s; +} + +vlib_node_registration_t ip4_hop_by_hop_node; + +#define foreach_ip4_hop_by_hop_error \ +_(PROCESSED, "Pkts with ip4 hop-by-hop options") + +typedef enum { +#define _(sym,str) IP4_HOP_BY_HOP_ERROR_##sym, + foreach_ip4_hop_by_hop_error +#undef _ + IP4_HOP_BY_HOP_N_ERROR, +} ip4_hop_by_hop_error_t; + +static char * ip4_hop_by_hop_error_strings[] = { +#define _(sym,string) string, + foreach_ip4_hop_by_hop_error +#undef _ +}; + +static uword +ip4_hop_by_hop_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + u32 n_left_from, * from, * to_next; + 
ip_lookup_next_t next_index; + u32 processed = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + +#if 0 + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 next0 = IP4_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 next1 = IP4_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0, sw_if_index1; + u8 tmp0[6], tmp1[6]; + ethernet_header_t *en0, *en1; + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* $$$$$ Dual loop: process 2 x packets here $$$$$ */ + ASSERT (b0->current_data == 0); + ASSERT (b1->current_data == 0); + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b0); + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + + /* $$$$$ End of processing 2 x packets $$$$$ */ + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + ip4_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + ip4_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + 
t->next_index = next1; + } + } + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } +#endif + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + u32 adj_index0; + ip_adjacency_t * adj0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + adj0 = ip_get_adjacency (lm, adj_index0); + + /* $$$$$$$$$$$$ process one (or more) hop-by-hop header(s) here */ + + + /* $$$$$$$$$$$$ */ + + /* Send the packet e.g. to ip4_rewrite */ + next0 = adj0->lookup_next_index; + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ip4_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + } + + processed++; + + /* $$$$$ Done processing 1 packet here $$$$$ */ + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, ip4_hop_by_hop_node.index, + IP4_HOP_BY_HOP_ERROR_PROCESSED, processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip4_hop_by_hop_node) = { + .function = ip4_hop_by_hop_node_fn, + .name = "ip4-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip4_hop_by_hop_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(ip4_hop_by_hop_error_strings), + .error_strings = ip4_hop_by_hop_error_strings, + + /* See ip/lookup.h */ + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = "ip4-miss", + [IP_LOOKUP_NEXT_DROP] = 
"ip4-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip4-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip4-local", + [IP_LOOKUP_NEXT_ARP] = "ip4-arp", + [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit", + [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify", + [IP_LOOKUP_NEXT_MAP] = "ip4-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd", + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop", /* probably not */ + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop", + }, +}; + +VLIB_REGISTER_NODE (ip4_add_hop_by_hop_node) = { + .function = ip4_hop_by_hop_node_fn, + .name = "ip4-add-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip4_hop_by_hop_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(ip4_hop_by_hop_error_strings), + .error_strings = ip4_hop_by_hop_error_strings, + + /* See ip/lookup.h */ + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = "ip4-miss", + [IP_LOOKUP_NEXT_DROP] = "ip4-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip4-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip4-local", + [IP_LOOKUP_NEXT_ARP] = "ip4-arp", + [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit", + [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify", + [IP_LOOKUP_NEXT_MAP] = "ip4-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd", + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop", /* probably not */ + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop", + }, +}; + +VLIB_REGISTER_NODE (ip4_pop_hop_by_hop_node) = { + .function = ip4_hop_by_hop_node_fn, + .name = "ip4-pop-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip4_hop_by_hop_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(ip4_hop_by_hop_error_strings), + .error_strings = ip4_hop_by_hop_error_strings, + + /* See ip/lookup.h */ + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = 
"ip4-miss", + [IP_LOOKUP_NEXT_DROP] = "ip4-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip4-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip4-local", + [IP_LOOKUP_NEXT_ARP] = "ip4-arp", + [IP_LOOKUP_NEXT_REWRITE] = "ip4-rewrite-transit", + [IP_LOOKUP_NEXT_CLASSIFY] = "ip4-classify", + [IP_LOOKUP_NEXT_MAP] = "ip4-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip4-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd", + [IP_LOOKUP_NEXT_SIXRD] = "ip4-sixrd", + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip4-hop-by-hop", /* probably not */ + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip4-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip4-pop-hop-by-hop", + }, +}; + +static clib_error_t * +ip4_hop_by_hop_init (vlib_main_t * vm) +{ + ip4_hop_by_hop_main_t * hm = &ip4_hop_by_hop_main; + + hm->vlib_main = vm; + hm->vnet_main = vnet_get_main(); + + return 0; +} + +VLIB_INIT_FUNCTION (ip4_hop_by_hop_init); diff --git a/vnet/vnet/ip/ip4_input.c b/vnet/vnet/ip/ip4_input.c new file mode 100644 index 00000000000..68edc0fa918 --- /dev/null +++ b/vnet/vnet/ip/ip4_input.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip/ip4_input.c: IP v4 input node + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#include <vnet/ip/ip.h>
+#include <vnet/ethernet/ethernet.h>
+#include <vnet/ppp/ppp.h>
+#include <vnet/hdlc/hdlc.h>
+/* Trace record for the ip4 input nodes: up to 64 bytes of packet data. */
+typedef struct {
+  u8 packet_data[64];
+} ip4_input_trace_t;
+/* Trace formatter: prints the captured bytes with format_ip4_header. */
+static u8 * format_ip4_input_trace (u8 * s, va_list * va)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *);
+  ip4_input_trace_t * t = va_arg (*va, ip4_input_trace_t *);
+
+  s = format (s, "%U",
+              format_ip4_header,
+              t->packet_data, sizeof (t->packet_data));
+
+  return s;
+}
+/* Next-node indices used by both ip4 input node registrations below. */
+typedef enum {
+  IP4_INPUT_NEXT_DROP,
+  IP4_INPUT_NEXT_PUNT,
+  IP4_INPUT_NEXT_LOOKUP,
+  IP4_INPUT_NEXT_LOOKUP_MULTICAST,
+  IP4_INPUT_NEXT_TTL_EXPIRE,
+  IP4_INPUT_N_NEXT,
+} ip4_input_next_t;
+
+/* Validate IP v4 packets and pass them either to forwarding code
+   or drop/punt exception packets.
+
+   VERIFY_CHECKSUM selects whether the header checksum is checked.
+
+   For each buffer the next node is first taken from the RX feature
+   configuration of its interface (unicast or multicast, chosen from the
+   destination address).  A chain of header checks then runs; each failing
+   check overwrites the error code, so the last failing check wins.  On
+   any error the next node becomes punt (IP4_ERROR_OPTIONS), ttl-expire
+   (IP4_ERROR_TIME_EXPIRED) or drop (everything else).
+
+   Errors are always recorded against ip4_input_node's error counters,
+   also when running as ip4-input-no-checksum.  Returns frame->n_vectors. */
+always_inline uword
+ip4_input_inline (vlib_main_t * vm,
+                  vlib_node_runtime_t * node,
+                  vlib_frame_t * frame,
+                  int verify_checksum)
+{
+  ip4_main_t * im = &ip4_main;
+  vnet_main_t * vnm = vnet_get_main();
+  ip_lookup_main_t * lm = &im->lookup_main;
+  u32 n_left_from, * from, * to_next;
+  ip4_input_next_t next_index;
+  vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index);
+  vlib_simple_counter_main_t * cm;
+  u32 cpu_index = os_get_cpu_number();
+
+  from = vlib_frame_vector_args (frame);
+  n_left_from = frame->n_vectors;
+  next_index = node->cached_next_index;
+
+  if (node->flags & VLIB_NODE_FLAG_TRACE)
+    vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors,
+                                   /* stride */ 1,
+                                   sizeof (ip4_input_trace_t));
+
+  cm = vec_elt_at_index (vnm->interface_main.sw_if_counters,
+                         VNET_INTERFACE_COUNTER_IP4);
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                           to_next, n_left_to_next);
+
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+        {
+          vlib_buffer_t * p0, * p1;
+          ip4_header_t * ip0, * ip1;
+          ip_config_main_t * cm0, * cm1;
+          u32 sw_if_index0, pi0, ip_len0, cur_len0, next0;
+          u32 sw_if_index1, pi1, ip_len1, cur_len1, next1;
+          i32 len_diff0, len_diff1;
+          u8 error0, error1, cast0, cast1;
+
+          /* Prefetch next iteration. */
+          {
+            vlib_buffer_t * p2, * p3;
+
+            p2 = vlib_get_buffer (vm, from[2]);
+            p3 = vlib_get_buffer (vm, from[3]);
+
+            vlib_prefetch_buffer_header (p2, LOAD);
+            vlib_prefetch_buffer_header (p3, LOAD);
+
+            CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD);
+            CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD);
+          }
+
+          to_next[0] = pi0 = from[0];
+          to_next[1] = pi1 = from[1];
+          from += 2;
+          to_next += 2;
+          n_left_from -= 2;
+          n_left_to_next -= 2;
+
+          p0 = vlib_get_buffer (vm, pi0);
+          p1 = vlib_get_buffer (vm, pi1);
+
+          ip0 = vlib_buffer_get_current (p0);
+          ip1 = vlib_buffer_get_current (p1);
+
+          sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+          sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX];
+
+          cast0 = ip4_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+          cast1 = ip4_address_is_multicast (&ip1->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+
+          cm0 = lm->rx_config_mains + cast0;
+          cm1 = lm->rx_config_mains + cast1;
+
+          vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0);
+          vnet_buffer (p1)->ip.current_config_index = vec_elt (cm1->config_index_by_sw_if_index, sw_if_index1);
+
+          vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
+          vnet_buffer (p1)->ip.adj_index[VLIB_RX] = ~0;
+
+          vnet_get_config_data (&cm0->config_main,
+                                &vnet_buffer (p0)->ip.current_config_index,
+                                &next0,
+                                /* # bytes of config data */ 0);
+          vnet_get_config_data (&cm1->config_main,
+                                &vnet_buffer (p1)->ip.current_config_index,
+                                &next1,
+                                /* # bytes of config data */ 0);
+
+          vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+          vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1);
+
+          error0 = error1 = IP4_ERROR_NONE;
+
+          /* Punt packets with options. */
+          error0 = (ip0->ip_version_and_header_length & 0xf) != 5 ? IP4_ERROR_OPTIONS : error0;
+          error1 = (ip1->ip_version_and_header_length & 0xf) != 5 ? IP4_ERROR_OPTIONS : error1;
+
+          /* Version != 4?  Drop it. */
+          error0 = (ip0->ip_version_and_header_length >> 4) != 4 ? IP4_ERROR_VERSION : error0;
+          error1 = (ip1->ip_version_and_header_length >> 4) != 4 ? IP4_ERROR_VERSION : error1;
+
+          /* Verify header checksum. */
+          if (verify_checksum)
+            {
+              ip_csum_t sum0, sum1;
+
+              ip4_partial_header_checksum_x1 (ip0, sum0);
+              ip4_partial_header_checksum_x1 (ip1, sum1);
+
+              error0 = 0xffff != ip_csum_fold (sum0) ? IP4_ERROR_BAD_CHECKSUM : error0;
+              error1 = 0xffff != ip_csum_fold (sum1) ? IP4_ERROR_BAD_CHECKSUM : error1;
+            }
+
+          /* Drop fragmentation offset 1 packets. */
+          error0 = ip4_get_fragment_offset (ip0) == 1 ? IP4_ERROR_FRAGMENT_OFFSET_ONE : error0;
+          error1 = ip4_get_fragment_offset (ip1) == 1 ? IP4_ERROR_FRAGMENT_OFFSET_ONE : error1;
+
+          /* TTL <= 1? Drop it.  (Only applied to unicast packets.) */
+          error0 = (ip0->ttl <= 1 && cast0 == VNET_UNICAST) ? IP4_ERROR_TIME_EXPIRED : error0;
+          error1 = (ip1->ttl <= 1 && cast1 == VNET_UNICAST) ? IP4_ERROR_TIME_EXPIRED : error1;
+
+          /* Verify lengths. */
+          ip_len0 = clib_net_to_host_u16 (ip0->length);
+          ip_len1 = clib_net_to_host_u16 (ip1->length);
+
+          /* IP length must be at least minimal IP header. */
+          error0 = ip_len0 < sizeof (ip0[0]) ? IP4_ERROR_TOO_SHORT : error0;
+          error1 = ip_len1 < sizeof (ip1[0]) ? IP4_ERROR_TOO_SHORT : error1;
+
+          cur_len0 = vlib_buffer_length_in_chain (vm, p0);
+          cur_len1 = vlib_buffer_length_in_chain (vm, p1);
+
+          len_diff0 = cur_len0 - ip_len0;
+          len_diff1 = cur_len1 - ip_len1;
+
+          /* The header claims more bytes than the buffer chain holds. */
+          error0 = len_diff0 < 0 ? IP4_ERROR_BAD_LENGTH : error0;
+          error1 = len_diff1 < 0 ? IP4_ERROR_BAD_LENGTH : error1;
+
+          p0->error = error_node->errors[error0];
+          p1->error = error_node->errors[error1];
+
+          if (PREDICT_FALSE(error0 != IP4_ERROR_NONE))
+            {
+              next0 = (error0 != IP4_ERROR_OPTIONS
+                       ? (error0 == IP4_ERROR_TIME_EXPIRED
+                          ? IP4_INPUT_NEXT_TTL_EXPIRE
+                          : IP4_INPUT_NEXT_DROP)
+                       : IP4_INPUT_NEXT_PUNT);
+            }
+          if (PREDICT_FALSE(error1 != IP4_ERROR_NONE))
+            {
+              next1 = (error1 != IP4_ERROR_OPTIONS
+                       ? (error1 == IP4_ERROR_TIME_EXPIRED
+                          ? IP4_INPUT_NEXT_TTL_EXPIRE
+                          : IP4_INPUT_NEXT_DROP)
+                       : IP4_INPUT_NEXT_PUNT);
+            }
+
+          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+                                           to_next, n_left_to_next,
+                                           pi0, pi1, next0, next1);
+        }
+      while (n_left_from > 0 && n_left_to_next > 0) /* single loop: same checks, one buffer at a time */
+        {
+          vlib_buffer_t * p0;
+          ip4_header_t * ip0;
+          ip_config_main_t * cm0;
+          u32 sw_if_index0, pi0, ip_len0, cur_len0, next0;
+          i32 len_diff0;
+          u8 error0, cast0;
+
+          pi0 = from[0];
+          to_next[0] = pi0;
+          from += 1;
+          to_next += 1;
+          n_left_from -= 1;
+          n_left_to_next -= 1;
+
+          p0 = vlib_get_buffer (vm, pi0);
+          ip0 = vlib_buffer_get_current (p0);
+
+          sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX];
+
+          cast0 = ip4_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST;
+          cm0 = lm->rx_config_mains + cast0;
+          vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0);
+          vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0;
+          vnet_get_config_data (&cm0->config_main,
+                                &vnet_buffer (p0)->ip.current_config_index,
+                                &next0,
+                                /* # bytes of config data */ 0);
+
+          vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1);
+
+          error0 = IP4_ERROR_NONE;
+
+          /* Punt packets with options. */
+          error0 = (ip0->ip_version_and_header_length & 0xf) != 5 ? IP4_ERROR_OPTIONS : error0;
+
+          /* Version != 4?  Drop it. */
+          error0 = (ip0->ip_version_and_header_length >> 4) != 4 ? IP4_ERROR_VERSION : error0;
+
+          /* Verify header checksum. */
+          if (verify_checksum)
+            {
+              ip_csum_t sum0;
+
+              ip4_partial_header_checksum_x1 (ip0, sum0);
+              error0 = 0xffff != ip_csum_fold (sum0) ? IP4_ERROR_BAD_CHECKSUM : error0;
+            }
+
+          /* Drop fragmentation offset 1 packets. */
+          error0 = ip4_get_fragment_offset (ip0) == 1 ? IP4_ERROR_FRAGMENT_OFFSET_ONE : error0;
+
+          /* TTL <= 1? Drop it.  (Only applied to unicast packets.) */
+          error0 = (ip0->ttl <= 1 && cast0 == VNET_UNICAST) ? IP4_ERROR_TIME_EXPIRED : error0;
+
+          /* Verify lengths. */
+          ip_len0 = clib_net_to_host_u16 (ip0->length);
+
+          /* IP length must be at least minimal IP header. */
+          error0 = ip_len0 < sizeof (ip0[0]) ? IP4_ERROR_TOO_SHORT : error0;
+
+          cur_len0 = vlib_buffer_length_in_chain (vm, p0);
+          len_diff0 = cur_len0 - ip_len0;
+          error0 = len_diff0 < 0 ? IP4_ERROR_BAD_LENGTH : error0;
+
+          p0->error = error_node->errors[error0];
+          if (PREDICT_FALSE(error0 != IP4_ERROR_NONE))
+            {
+              next0 = (error0 != IP4_ERROR_OPTIONS
+                       ? (error0 == IP4_ERROR_TIME_EXPIRED
+                          ? IP4_INPUT_NEXT_TTL_EXPIRE
+                          : IP4_INPUT_NEXT_DROP)
+                       : IP4_INPUT_NEXT_PUNT);
+            }
+
+          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                           to_next, n_left_to_next,
+                                           pi0, next0);
+        }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+
+  return frame->n_vectors;
+}
+/* Node function for "ip4-input": validation including the header checksum. */
+static uword
+ip4_input (vlib_main_t * vm,
+           vlib_node_runtime_t * node,
+           vlib_frame_t * frame)
+{
+  return ip4_input_inline (vm, node, frame, /* verify_checksum */ 1);
+}
+/* Node function for "ip4-input-no-checksum": same, minus the checksum check. */
+static uword
+ip4_input_no_checksum (vlib_main_t * vm,
+                       vlib_node_runtime_t * node,
+                       vlib_frame_t * frame)
+{
+  return ip4_input_inline (vm, node, frame, /* verify_checksum */ 0);
+}
+/* Error strings, generated from foreach_ip4_error. */
+static char * ip4_error_strings[] = {
+#define _(sym,string) string,
+  foreach_ip4_error
+#undef _
+};
+
+VLIB_REGISTER_NODE (ip4_input_node) = {
+  .function = ip4_input,
+  .name = "ip4-input",
+  .vector_size = sizeof (u32),
+
+  .n_errors = IP4_N_ERROR,
+  .error_strings = ip4_error_strings,
+
+  .n_next_nodes = IP4_INPUT_N_NEXT,
+  .next_nodes = {
+    [IP4_INPUT_NEXT_DROP] = "error-drop",
+    [IP4_INPUT_NEXT_PUNT] = "error-punt",
+    [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
+    [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-lookup-multicast",
+    [IP4_INPUT_NEXT_TTL_EXPIRE] = "ip4-icmp-ttl-expire",
+  },
+
+  .format_buffer = format_ip4_header,
+  .format_trace = format_ip4_input_trace,
+};
+/* Same next nodes as ip4-input; this registration sets no error strings. */
+VLIB_REGISTER_NODE (ip4_input_no_checksum_node,static) = {
+  .function = ip4_input_no_checksum,
+  .name = "ip4-input-no-checksum",
+  .vector_size = sizeof (u32),
+
+  .n_next_nodes = IP4_INPUT_N_NEXT,
+  .next_nodes = {
+    [IP4_INPUT_NEXT_DROP] = "error-drop",
+    [IP4_INPUT_NEXT_PUNT] = "error-punt",
+    [IP4_INPUT_NEXT_LOOKUP] = "ip4-lookup",
+    [IP4_INPUT_NEXT_LOOKUP_MULTICAST] = "ip4-lookup-multicast",
+    [IP4_INPUT_NEXT_TTL_EXPIRE] = "ip4-icmp-ttl-expire",
+  },
+
+  .format_buffer = format_ip4_header,
+  .format_trace = format_ip4_input_trace,
+};
+/* Init function: registers ip4-input as the handler for IPv4 on Ethernet,
+   PPP and HDLC, installs the packet-generator header parser on both input
+   nodes, runs the ip4_cli_init and ip4_source_check_init init functions,
+   and sets the flow hash seed and the default TTL. */
+static clib_error_t * ip4_init (vlib_main_t * vm)
+{
+  clib_error_t * error;
+
+  ethernet_register_input_type (vm, ETHERNET_TYPE_IP4,
+                                ip4_input_node.index);
+  ppp_register_input_protocol (vm, PPP_PROTOCOL_ip4,
+                               ip4_input_node.index);
+  hdlc_register_input_protocol (vm, HDLC_PROTOCOL_ip4,
+                                ip4_input_node.index);
+
+  {
+    pg_node_t * pn;
+    pn = pg_get_node (ip4_input_node.index);
+    pn->unformat_edit = unformat_pg_ip4_header;
+    pn = pg_get_node (ip4_input_no_checksum_node.index);
+    pn->unformat_edit = unformat_pg_ip4_header;
+  }
+
+  if ((error = vlib_call_init_function (vm, ip4_cli_init)))
+    return error;
+
+  if ((error = vlib_call_init_function (vm, ip4_source_check_init)))
+    return error;
+
+  /* Set flow hash to something non-zero. */
+  ip4_main.flow_hash_seed = 0xdeadbeef;
+
+  /* Default TTL for packets we generate. */
+  ip4_main.host_config.ttl = 64;
+
+  return error; /* error is 0 here: both calls above succeeded */
+}
+
+VLIB_INIT_FUNCTION (ip4_init);
diff --git a/vnet/vnet/ip/ip4_mtrie.c b/vnet/vnet/ip/ip4_mtrie.c
new file mode 100644
index 00000000000..ed4a0d9f44f
--- /dev/null
+++ b/vnet/vnet/ip/ip4_mtrie.c
@@ -0,0 +1,561 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4_fib.h: ip4 mtrie fib + * + * Copyright (c) 2012 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> + +static void +ply_init (ip4_fib_mtrie_ply_t * p, ip4_fib_mtrie_leaf_t init, uword prefix_len) +{ + p->n_non_empty_leafs = ip4_fib_mtrie_leaf_is_empty (init) ? 0 : ARRAY_LEN (p->leaves); + memset (p->dst_address_bits_of_leaves, prefix_len, sizeof (p->dst_address_bits_of_leaves)); + + /* Initialize leaves. 
*/ +#ifdef CLIB_HAVE_VEC128 + { + u32x4 * l, init_x4; + +#ifndef __ALTIVEC__ + init_x4 = u32x4_splat (init); +#else + { + u32x4_union_t y; + y.as_u32[0] = init; + y.as_u32[1] = init; + y.as_u32[2] = init; + y.as_u32[3] = init; + init_x4 = y.as_u32x4; + } +#endif + + for (l = p->leaves_as_u32x4; l < p->leaves_as_u32x4 + ARRAY_LEN (p->leaves_as_u32x4); l += 4) + { + l[0] = init_x4; + l[1] = init_x4; + l[2] = init_x4; + l[3] = init_x4; + } + } +#else + { + u32 * l; + + for (l = p->leaves; l < p->leaves + ARRAY_LEN (p->leaves); l += 4) + { + l[0] = init; + l[1] = init; + l[2] = init; + l[3] = init; + } + } +#endif +} + +static ip4_fib_mtrie_leaf_t +ply_create (ip4_fib_mtrie_t * m, ip4_fib_mtrie_leaf_t init_leaf, uword prefix_len) +{ + ip4_fib_mtrie_ply_t * p; + + /* Get cache aligned ply. */ + pool_get_aligned (m->ply_pool, p, sizeof (p[0])); + + ply_init (p, init_leaf, prefix_len); + return ip4_fib_mtrie_leaf_set_next_ply_index (p - m->ply_pool); +} + +always_inline ip4_fib_mtrie_ply_t * +get_next_ply_for_leaf (ip4_fib_mtrie_t * m, ip4_fib_mtrie_leaf_t l) +{ + uword n = ip4_fib_mtrie_leaf_get_next_ply_index (l); + /* It better not be the root ply. 
*/ + ASSERT (n != 0); + return pool_elt_at_index (m->ply_pool, n); +} + +static void +ply_free (ip4_fib_mtrie_t * m, ip4_fib_mtrie_ply_t * p) +{ + uword i, is_root; + + is_root = p - m->ply_pool == 0; + + for (i = 0 ; i < ARRAY_LEN (p->leaves); i++) + { + ip4_fib_mtrie_leaf_t l = p->leaves[i]; + if (ip4_fib_mtrie_leaf_is_next_ply (l)) + ply_free (m, get_next_ply_for_leaf (m, l)); + } + + if (is_root) + ply_init (p, IP4_FIB_MTRIE_LEAF_EMPTY, /* prefix_len */ 0); + else + pool_put (m->ply_pool, p); +} + +void ip4_fib_free (ip4_fib_mtrie_t * m) +{ + ip4_fib_mtrie_ply_t * root_ply = pool_elt_at_index (m->ply_pool, 0); + ply_free (m, root_ply); +} + +u32 ip4_mtrie_lookup_address (ip4_fib_mtrie_t * m, ip4_address_t dst) +{ + ip4_fib_mtrie_ply_t * p = pool_elt_at_index (m->ply_pool, 0); + ip4_fib_mtrie_leaf_t l; + + l = p->leaves[dst.as_u8[0]]; + if (ip4_fib_mtrie_leaf_is_terminal (l)) + return ip4_fib_mtrie_leaf_get_adj_index (l); + + p = get_next_ply_for_leaf (m, l); + l = p->leaves[dst.as_u8[1]]; + if (ip4_fib_mtrie_leaf_is_terminal (l)) + return ip4_fib_mtrie_leaf_get_adj_index (l); + + p = get_next_ply_for_leaf (m, l); + l = p->leaves[dst.as_u8[2]]; + if (ip4_fib_mtrie_leaf_is_terminal (l)) + return ip4_fib_mtrie_leaf_get_adj_index (l); + + p = get_next_ply_for_leaf (m, l); + l = p->leaves[dst.as_u8[3]]; + + ASSERT (ip4_fib_mtrie_leaf_is_terminal (l)); + return ip4_fib_mtrie_leaf_get_adj_index (l); +} + +typedef struct { + ip4_address_t dst_address; + u32 dst_address_length; + u32 adj_index; +} ip4_fib_mtrie_set_unset_leaf_args_t; + +static void +set_ply_with_more_specific_leaf (ip4_fib_mtrie_t * m, + ip4_fib_mtrie_ply_t * ply, + ip4_fib_mtrie_leaf_t new_leaf, + uword new_leaf_dst_address_bits) +{ + ip4_fib_mtrie_leaf_t old_leaf; + uword i; + + ASSERT (ip4_fib_mtrie_leaf_is_terminal (new_leaf)); + ASSERT (! ip4_fib_mtrie_leaf_is_empty (new_leaf)); + + for (i = 0; i < ARRAY_LEN (ply->leaves); i++) + { + old_leaf = ply->leaves[i]; + + /* Recurse into sub plies. 
*/ + if (! ip4_fib_mtrie_leaf_is_terminal (old_leaf)) + { + ip4_fib_mtrie_ply_t * sub_ply = get_next_ply_for_leaf (m, old_leaf); + set_ply_with_more_specific_leaf (m, sub_ply, new_leaf, new_leaf_dst_address_bits); + } + + /* Replace less specific terminal leaves with new leaf. */ + else if (new_leaf_dst_address_bits >= ply->dst_address_bits_of_leaves[i]) + { + ply->leaves[i] = new_leaf; + ply->dst_address_bits_of_leaves[i] = new_leaf_dst_address_bits; + ply->n_non_empty_leafs += ip4_fib_mtrie_leaf_is_empty (old_leaf); + } + } +} + +static void +set_leaf (ip4_fib_mtrie_t * m, + ip4_fib_mtrie_set_unset_leaf_args_t * a, + u32 old_ply_index, + u32 dst_address_byte_index) +{ + ip4_fib_mtrie_leaf_t old_leaf, new_leaf; + i32 n_dst_bits_next_plies; + u8 dst_byte; + + ASSERT (a->dst_address_length > 0 && a->dst_address_length <= 32); + ASSERT (dst_address_byte_index < ARRAY_LEN (a->dst_address.as_u8)); + + n_dst_bits_next_plies = a->dst_address_length - BITS (u8) * (dst_address_byte_index + 1); + + dst_byte = a->dst_address.as_u8[dst_address_byte_index]; + + /* Number of bits next plies <= 0 => insert leaves this ply. */ + if (n_dst_bits_next_plies <= 0) + { + uword i, n_dst_bits_this_ply, old_leaf_is_terminal; + + n_dst_bits_this_ply = -n_dst_bits_next_plies; + ASSERT ((a->dst_address.as_u8[dst_address_byte_index] & pow2_mask (n_dst_bits_this_ply)) == 0); + + for (i = dst_byte; i < dst_byte + (1 << n_dst_bits_this_ply); i++) + { + ip4_fib_mtrie_ply_t * old_ply, * new_ply; + + old_ply = pool_elt_at_index (m->ply_pool, old_ply_index); + + old_leaf = old_ply->leaves[i]; + old_leaf_is_terminal = ip4_fib_mtrie_leaf_is_terminal (old_leaf); + + /* Is leaf to be inserted more specific? 
*/ + if (a->dst_address_length >= old_ply->dst_address_bits_of_leaves[i]) + { + new_leaf = ip4_fib_mtrie_leaf_set_adj_index (a->adj_index); + + if (old_leaf_is_terminal) + { + old_ply->dst_address_bits_of_leaves[i] = a->dst_address_length; + old_ply->leaves[i] = new_leaf; + old_ply->n_non_empty_leafs += ip4_fib_mtrie_leaf_is_empty (old_leaf); + ASSERT (old_ply->n_non_empty_leafs <= ARRAY_LEN (old_ply->leaves)); + } + else + { + /* Existing leaf points to another ply. We need to place new_leaf into all + more specific slots. */ + new_ply = get_next_ply_for_leaf (m, old_leaf); + set_ply_with_more_specific_leaf (m, new_ply, new_leaf, a->dst_address_length); + } + } + + else if (! old_leaf_is_terminal) + { + new_ply = get_next_ply_for_leaf (m, old_leaf); + set_leaf (m, a, new_ply - m->ply_pool, dst_address_byte_index + 1); + } + } + } + else + { + ip4_fib_mtrie_ply_t * old_ply, * new_ply; + + old_ply = pool_elt_at_index (m->ply_pool, old_ply_index); + old_leaf = old_ply->leaves[dst_byte]; + if (ip4_fib_mtrie_leaf_is_terminal (old_leaf)) + { + new_leaf = ply_create (m, old_leaf, old_ply->dst_address_bits_of_leaves[dst_byte]); + new_ply = get_next_ply_for_leaf (m, new_leaf); + + /* Refetch since ply_create may move pool. */ + old_ply = pool_elt_at_index (m->ply_pool, old_ply_index); + + old_ply->leaves[dst_byte] = new_leaf; + old_ply->dst_address_bits_of_leaves[dst_byte] = 0; + + old_ply->n_non_empty_leafs -= ip4_fib_mtrie_leaf_is_non_empty (old_leaf); + ASSERT (old_ply->n_non_empty_leafs >= 0); + + /* Account for the ply we just created. 
*/ + old_ply->n_non_empty_leafs += 1; + } + else + new_ply = get_next_ply_for_leaf (m, old_leaf); + + set_leaf (m, a, new_ply - m->ply_pool, dst_address_byte_index + 1); + } +} + +static uword +unset_leaf (ip4_fib_mtrie_t * m, + ip4_fib_mtrie_set_unset_leaf_args_t * a, + ip4_fib_mtrie_ply_t * old_ply, + u32 dst_address_byte_index) +{ + ip4_fib_mtrie_leaf_t old_leaf, del_leaf; + i32 n_dst_bits_next_plies; + uword i, n_dst_bits_this_ply, old_leaf_is_terminal; + u8 dst_byte; + + ASSERT (a->dst_address_length > 0 && a->dst_address_length <= 32); + ASSERT (dst_address_byte_index < ARRAY_LEN (a->dst_address.as_u8)); + + n_dst_bits_next_plies = a->dst_address_length - BITS (u8) * (dst_address_byte_index + 1); + + dst_byte = a->dst_address.as_u8[dst_address_byte_index]; + if (n_dst_bits_next_plies < 0) + dst_byte &= ~pow2_mask (-n_dst_bits_next_plies); + + n_dst_bits_this_ply = n_dst_bits_next_plies <= 0 ? -n_dst_bits_next_plies : 0; + n_dst_bits_this_ply = clib_min (8, n_dst_bits_this_ply); + + del_leaf = ip4_fib_mtrie_leaf_set_adj_index (a->adj_index); + + for (i = dst_byte; i < dst_byte + (1 << n_dst_bits_this_ply); i++) + { + old_leaf = old_ply->leaves[i]; + old_leaf_is_terminal = ip4_fib_mtrie_leaf_is_terminal (old_leaf); + + if (old_leaf == del_leaf + || (! old_leaf_is_terminal + && unset_leaf (m, a, get_next_ply_for_leaf (m, old_leaf), dst_address_byte_index + 1))) + { + old_ply->leaves[i] = IP4_FIB_MTRIE_LEAF_EMPTY; + old_ply->dst_address_bits_of_leaves[i] = 0; + + /* No matter what we just deleted a non-empty leaf. */ + ASSERT (! ip4_fib_mtrie_leaf_is_empty (old_leaf)); + old_ply->n_non_empty_leafs -= 1; + + ASSERT (old_ply->n_non_empty_leafs >= 0); + if (old_ply->n_non_empty_leafs == 0 && dst_address_byte_index > 0) + { + pool_put (m->ply_pool, old_ply); + /* Old ply was deleted. */ + return 1; + } + } + } + + /* Old ply was not deleted. 
*/ + return 0; +} + +void ip4_mtrie_init (ip4_fib_mtrie_t * m) +{ + ip4_fib_mtrie_leaf_t root; + memset (m, 0, sizeof (m[0])); + m->default_leaf = IP4_FIB_MTRIE_LEAF_EMPTY; + root = ply_create (m, IP4_FIB_MTRIE_LEAF_EMPTY, /* dst_address_bits_of_leaves */ 0); + ASSERT (ip4_fib_mtrie_leaf_get_next_ply_index (root) == 0); +} + +void +ip4_fib_mtrie_add_del_route (ip4_fib_t * fib, + ip4_address_t dst_address, + u32 dst_address_length, + u32 adj_index, + u32 is_del) +{ + ip4_fib_mtrie_t * m = &fib->mtrie; + ip4_fib_mtrie_ply_t * root_ply; + ip4_fib_mtrie_set_unset_leaf_args_t a; + ip4_main_t * im = &ip4_main; + + ASSERT(m->ply_pool != 0); + + root_ply = pool_elt_at_index (m->ply_pool, 0); + + /* Honor dst_address_length. Fib masks are in network byte order */ + dst_address.as_u32 &= im->fib_masks[dst_address_length]; + a.dst_address = dst_address; + a.dst_address_length = dst_address_length; + a.adj_index = adj_index; + + if (! is_del) + { + if (dst_address_length == 0) + m->default_leaf = ip4_fib_mtrie_leaf_set_adj_index (adj_index); + else + set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0); + } + else + { + if (dst_address_length == 0) + m->default_leaf = IP4_FIB_MTRIE_LEAF_EMPTY; + + else + { + ip4_main_t * im = &ip4_main; + uword i; + + unset_leaf (m, &a, root_ply, 0); + + /* Find next less specific route and insert into mtrie. */ + for (i = ARRAY_LEN (fib->adj_index_by_dst_address) - 1; i >= 1; i--) + { + uword * p; + ip4_address_t key; + + if (! 
fib->adj_index_by_dst_address[i]) + continue; + + key.as_u32 = dst_address.as_u32 & im->fib_masks[i]; + p = hash_get (fib->adj_index_by_dst_address[i], key.as_u32); + if (p) + { + a.dst_address = key; + a.dst_address_length = i; + a.adj_index = p[0]; + set_leaf (m, &a, /* ply_index */ 0, /* dst_address_byte_index */ 0); + break; + } + } + } + } +} + +always_inline uword +maybe_remap_leaf (ip_lookup_main_t * lm, ip4_fib_mtrie_leaf_t * p) +{ + ip4_fib_mtrie_leaf_t l = p[0]; + uword was_remapped_to_empty_leaf = 0; + if (ip4_fib_mtrie_leaf_is_terminal (l)) + { + u32 adj_index = ip4_fib_mtrie_leaf_get_adj_index (l); + u32 m = vec_elt (lm->adjacency_remap_table, adj_index); + if (m) + { + was_remapped_to_empty_leaf = m == ~0; + if (was_remapped_to_empty_leaf) + p[0] = (was_remapped_to_empty_leaf + ? IP4_FIB_MTRIE_LEAF_EMPTY + : ip4_fib_mtrie_leaf_set_adj_index (m - 1)); + } + } + return was_remapped_to_empty_leaf; +} + +static void maybe_remap_ply (ip_lookup_main_t * lm, ip4_fib_mtrie_ply_t * ply) +{ + u32 n_remapped_to_empty = 0; + u32 i; + for (i = 0; i < ARRAY_LEN (ply->leaves); i++) + n_remapped_to_empty += maybe_remap_leaf (lm, &ply->leaves[i]); + if (n_remapped_to_empty > 0) + { + ASSERT (n_remapped_to_empty <= ply->n_non_empty_leafs); + ply->n_non_empty_leafs -= n_remapped_to_empty; + if (ply->n_non_empty_leafs == 0) + os_panic (); + } +} + +void ip4_mtrie_maybe_remap_adjacencies (ip_lookup_main_t * lm, ip4_fib_mtrie_t * m) +{ + ip4_fib_mtrie_ply_t * ply; + pool_foreach (ply, m->ply_pool, maybe_remap_ply (lm, ply)); + maybe_remap_leaf (lm, &m->default_leaf); +} + +/* Returns number of bytes of memory used by mtrie. */ +static uword mtrie_memory_usage (ip4_fib_mtrie_t * m, ip4_fib_mtrie_ply_t * p) +{ + uword bytes, i; + + if (! 
p) + { + if (pool_is_free_index (m->ply_pool, 0)) + return 0; + p = pool_elt_at_index (m->ply_pool, 0); + } + + bytes = sizeof (p[0]); + for (i = 0 ; i < ARRAY_LEN (p->leaves); i++) + { + ip4_fib_mtrie_leaf_t l = p->leaves[i]; + if (ip4_fib_mtrie_leaf_is_next_ply (l)) + bytes += mtrie_memory_usage (m, get_next_ply_for_leaf (m, l)); + } + + return bytes; +} + +static u8 * format_ip4_fib_mtrie_leaf (u8 * s, va_list * va) +{ + ip4_fib_mtrie_leaf_t l = va_arg (*va, ip4_fib_mtrie_leaf_t); + + if (ip4_fib_mtrie_leaf_is_empty (l)) + s = format (s, "miss"); + else if (ip4_fib_mtrie_leaf_is_terminal (l)) + s = format (s, "adj %d", ip4_fib_mtrie_leaf_get_adj_index (l)); + else + s = format (s, "next ply %d", ip4_fib_mtrie_leaf_get_next_ply_index (l)); + return s; +} + +static u8 * format_ip4_fib_mtrie_ply (u8 * s, va_list * va) +{ + ip4_fib_mtrie_t * m = va_arg (*va, ip4_fib_mtrie_t *); + u32 base_address = va_arg (*va, u32); + u32 ply_index = va_arg (*va, u32); + u32 dst_address_byte_index = va_arg (*va, u32); + ip4_fib_mtrie_ply_t * p; + uword i, indent; + + p = pool_elt_at_index (m->ply_pool, ply_index); + indent = format_get_indent (s); + s = format (s, "ply index %d, %d non-empty leaves", ply_index, p->n_non_empty_leafs); + for (i = 0; i < ARRAY_LEN (p->leaves); i++) + { + ip4_fib_mtrie_leaf_t l = p->leaves[i]; + + if (! 
ip4_fib_mtrie_leaf_is_empty (l)) + { + u32 a, ia_length; + ip4_address_t ia; + + a = base_address + (i << (24 - 8*dst_address_byte_index)); + ia.as_u32 = clib_host_to_net_u32 (a); + if (ip4_fib_mtrie_leaf_is_terminal (l)) + ia_length = p->dst_address_bits_of_leaves[i]; + else + ia_length = 8*(1 + dst_address_byte_index); + s = format (s, "\n%U%20U %U", + format_white_space, indent + 2, + format_ip4_address_and_length, &ia, ia_length, + format_ip4_fib_mtrie_leaf, l); + + if (ip4_fib_mtrie_leaf_is_next_ply (l)) + s = format (s, "\n%U%U", + format_white_space, indent + 2, + format_ip4_fib_mtrie_ply, m, a, + ip4_fib_mtrie_leaf_get_next_ply_index (l), + dst_address_byte_index + 1); + } + } + + return s; +} + +u8 * format_ip4_fib_mtrie (u8 * s, va_list * va) +{ + ip4_fib_mtrie_t * m = va_arg (*va, ip4_fib_mtrie_t *); + + s = format (s, "%d plies, memory usage %U", + pool_elts (m->ply_pool), + format_memory_size, mtrie_memory_usage (m, 0)); + + if (pool_elts (m->ply_pool) > 0) + { + ip4_address_t base_address; + base_address.as_u32 = 0; + s = format (s, "\n %U", format_ip4_fib_mtrie_ply, m, base_address, 0, 0); + } + + return s; +} diff --git a/vnet/vnet/ip/ip4_mtrie.h b/vnet/vnet/ip/ip4_mtrie.h new file mode 100644 index 00000000000..31de41e14fa --- /dev/null +++ b/vnet/vnet/ip/ip4_mtrie.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip/ip4_fib.h: ip4 mtrie fib + * + * Copyright (c) 2012 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_ip_ip4_fib_h +#define included_ip_ip4_fib_h + +#include <vppinfra/cache.h> +#include <vppinfra/vector.h> +#include <vnet/ip/lookup.h> +#include <vnet/ip/ip4_packet.h> /* for ip4_address_t */ + +/* ip4 fib leafs: 4 ply 8-8-8-8 mtrie. + 1 + 2*adj_index for terminal leaves. + 0 + 2*next_ply_index for non-terminals. + 1 => empty (adjacency index of zero is special miss adjacency). 
*/ +typedef u32 ip4_fib_mtrie_leaf_t; + +#define IP4_FIB_MTRIE_LEAF_EMPTY (1 + 2*IP_LOOKUP_MISS_ADJ_INDEX) +#define IP4_FIB_MTRIE_LEAF_ROOT (0 + 2*0) + +always_inline u32 ip4_fib_mtrie_leaf_is_empty (ip4_fib_mtrie_leaf_t n) +{ return n == IP4_FIB_MTRIE_LEAF_EMPTY; } + +always_inline u32 ip4_fib_mtrie_leaf_is_non_empty (ip4_fib_mtrie_leaf_t n) +{ return n != IP4_FIB_MTRIE_LEAF_EMPTY; } + +always_inline u32 ip4_fib_mtrie_leaf_is_terminal (ip4_fib_mtrie_leaf_t n) +{ return n & 1; } + +always_inline u32 ip4_fib_mtrie_leaf_get_adj_index (ip4_fib_mtrie_leaf_t n) +{ + ASSERT (ip4_fib_mtrie_leaf_is_terminal (n)); + return n >> 1; +} + +always_inline ip4_fib_mtrie_leaf_t ip4_fib_mtrie_leaf_set_adj_index (u32 adj_index) +{ + ip4_fib_mtrie_leaf_t l; + l = 1 + 2*adj_index; + ASSERT (ip4_fib_mtrie_leaf_get_adj_index (l) == adj_index); + return l; +} + +always_inline u32 ip4_fib_mtrie_leaf_is_next_ply (ip4_fib_mtrie_leaf_t n) +{ return (n & 1) == 0; } + +always_inline u32 ip4_fib_mtrie_leaf_get_next_ply_index (ip4_fib_mtrie_leaf_t n) +{ + ASSERT (ip4_fib_mtrie_leaf_is_next_ply (n)); + return n >> 1; +} + +always_inline ip4_fib_mtrie_leaf_t ip4_fib_mtrie_leaf_set_next_ply_index (u32 i) +{ + ip4_fib_mtrie_leaf_t l; + l = 0 + 2*i; + ASSERT (ip4_fib_mtrie_leaf_get_next_ply_index (l) == i); + return l; +} + +/* One ply of the 4 ply mtrie fib. */ +typedef struct { + union { + ip4_fib_mtrie_leaf_t leaves[256]; + +#ifdef CLIB_HAVE_VEC128 + u32x4 leaves_as_u32x4[256 / 4]; +#endif + }; + + /* Prefix length for terminal leaves. */ + u8 dst_address_bits_of_leaves[256]; + + /* Number of non-empty leafs (whether terminal or not). */ + i32 n_non_empty_leafs; + + /* Pad to cache line boundary. */ + u8 pad[CLIB_CACHE_LINE_BYTES + - 1 * sizeof (i32)]; +} ip4_fib_mtrie_ply_t; + +typedef struct { + /* Pool of plies. Index zero is root ply. */ + ip4_fib_mtrie_ply_t * ply_pool; + + /* Special case leaf for default route 0.0.0.0/0. 
*/ + ip4_fib_mtrie_leaf_t default_leaf; +} ip4_fib_mtrie_t; + +void ip4_fib_mtrie_init (ip4_fib_mtrie_t * m); + +struct ip4_fib_t; + +void ip4_fib_mtrie_add_del_route (struct ip4_fib_t * f, + ip4_address_t dst_address, + u32 dst_address_length, + u32 adj_index, + u32 is_del); + +/* Returns adjacency index. */ +u32 ip4_mtrie_lookup_address (ip4_fib_mtrie_t * m, ip4_address_t dst); + +void ip4_mtrie_maybe_remap_adjacencies (ip_lookup_main_t * lm, ip4_fib_mtrie_t * m); + +format_function_t format_ip4_fib_mtrie; + +/* Lookup step. Processes 1 byte of 4 byte ip4 address. */ +always_inline ip4_fib_mtrie_leaf_t +ip4_fib_mtrie_lookup_step (ip4_fib_mtrie_t * m, + ip4_fib_mtrie_leaf_t current_leaf, + ip4_address_t * dst_address, + u32 dst_address_byte_index) +{ + ip4_fib_mtrie_leaf_t next_leaf; + ip4_fib_mtrie_ply_t * ply; + uword current_is_terminal = ip4_fib_mtrie_leaf_is_terminal (current_leaf); + + ply = m->ply_pool + (current_is_terminal ? 0 : (current_leaf >> 1)); + next_leaf = ply->leaves[dst_address->as_u8[dst_address_byte_index]]; + next_leaf = current_is_terminal ? current_leaf : next_leaf; + + return next_leaf; +} + +#endif /* included_ip_ip4_fib_h */ diff --git a/vnet/vnet/ip/ip4_packet.h b/vnet/vnet/ip/ip4_packet.h new file mode 100644 index 00000000000..69467eb4e03 --- /dev/null +++ b/vnet/vnet/ip/ip4_packet.h @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip4/packet.h: ip4 packet format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_ip4_packet_h +#define included_ip4_packet_h + +#include <vnet/ip/ip_packet.h> /* for ip_csum_t */ +#include <vnet/ip/tcp_packet.h> /* for tcp_header_t */ +#include <vppinfra/byte_order.h> /* for clib_net_to_host_u16 */ + +/* IP4 address which can be accessed either as 4 bytes + or as a 32-bit number. */ +typedef union { + u8 data[4]; + u32 data_u32; + /* Aliases. 
*/ + u8 as_u8[4]; + u32 as_u32; +} ip4_address_t; + +typedef struct { + /* IP address must be first for ip_interface_address_get_address() to work */ + ip4_address_t ip4_addr; + u32 fib_index; +} ip4_address_fib_t; + +always_inline void +ip4_addr_fib_init (ip4_address_fib_t * addr_fib, ip4_address_t * address, + u32 fib_index) +{ + memcpy (&addr_fib->ip4_addr, address, sizeof (addr_fib->ip4_addr)); + addr_fib->fib_index = fib_index; +} + +/* (src,dst) pair of addresses as found in packet header. */ +typedef struct { + ip4_address_t src, dst; +} ip4_address_pair_t; + +/* If address is a valid netmask, return length of mask. */ +always_inline uword +ip4_address_netmask_length (ip4_address_t * a) +{ + uword result = 0; + uword i; + for (i = 0; i < ARRAY_LEN (a->as_u8); i++) + { + switch (a->as_u8[i]) + { + case 0xff: result += 8; break; + case 0xfe: result += 7; goto done; + case 0xfc: result += 6; goto done; + case 0xf8: result += 5; goto done; + case 0xf0: result += 4; goto done; + case 0xe0: result += 3; goto done; + case 0xc0: result += 2; goto done; + case 0x80: result += 1; goto done; + case 0x00: result += 0; goto done; + default: + /* Not a valid netmask mask. */ + return ~0; + } + } + done: + return result; +} + +typedef union { + struct { + /* 4 bit packet length (in 32bit units) and version VVVVLLLL. + e.g. for packets w/ no options ip_version_and_header_length == 0x45. */ + u8 ip_version_and_header_length; + + /* Type of service. */ + u8 tos; + + /* Total layer 3 packet length including this header. */ + u16 length; + + /* Fragmentation ID. */ + u16 fragment_id; + + /* 3 bits of flags and 13 bits of fragment offset (in units + of 8 byte quantities). */ + u16 flags_and_fragment_offset; +#define IP4_HEADER_FLAG_MORE_FRAGMENTS (1 << 13) +#define IP4_HEADER_FLAG_DONT_FRAGMENT (1 << 14) +#define IP4_HEADER_FLAG_CONGESTION (1 << 15) + + /* Time to live decremented by router at each hop. */ + u8 ttl; + + /* Next level protocol packet. 
*/ + u8 protocol; + + /* Checksum. */ + u16 checksum; + + /* Source and destination address. */ + union { + struct { + ip4_address_t src_address, dst_address; + }; + ip4_address_pair_t address_pair; + }; + }; + + /* For checksumming we'll want to access IP header in word sized chunks. */ + /* For 64 bit machines. */ + CLIB_PACKED (struct { + u64 checksum_data_64[2]; + u32 checksum_data_64_32[1]; + }); + + /* For 32 bit machines. */ + CLIB_PACKED (struct { + u32 checksum_data_32[5]; + }); +} ip4_header_t; + +/* Value of ip_version_and_header_length for packets w/o options. */ +#define IP4_VERSION_AND_HEADER_LENGTH_NO_OPTIONS \ + ((4 << 4) | (sizeof (ip4_header_t) / sizeof (u32))) + +always_inline int +ip4_get_fragment_offset (ip4_header_t * i) +{ return clib_net_to_host_u16 (i->flags_and_fragment_offset) & 0x1fff; } + +always_inline int +ip4_get_fragment_more (ip4_header_t * i) +{ return clib_net_to_host_u16 (i->flags_and_fragment_offset) & IP4_HEADER_FLAG_MORE_FRAGMENTS; } + +always_inline int +ip4_is_fragment (ip4_header_t * i) +{ return (i->flags_and_fragment_offset & + clib_net_to_host_u16 (0x1fff | IP4_HEADER_FLAG_MORE_FRAGMENTS)); } + +always_inline int +ip4_is_first_fragment (ip4_header_t * i) +{ return (i->flags_and_fragment_offset & + clib_net_to_host_u16 (0x1fff | IP4_HEADER_FLAG_MORE_FRAGMENTS)) == + clib_net_to_host_u16 (IP4_HEADER_FLAG_MORE_FRAGMENTS); } + +/* Fragment offset in bytes. 
*/ +always_inline int +ip4_get_fragment_offset_bytes (ip4_header_t * i) +{ return 8 * ip4_get_fragment_offset (i); } + +always_inline int +ip4_header_bytes (ip4_header_t * i) +{ return sizeof (u32) * (i->ip_version_and_header_length & 0xf); } + +always_inline void * +ip4_next_header (ip4_header_t * i) +{ return (void *) i + ip4_header_bytes (i); } + +always_inline u16 +ip4_header_checksum (ip4_header_t * i) +{ + u16 save, csum; + ip_csum_t sum; + + save = i->checksum; + i->checksum = 0; + sum = ip_incremental_checksum (0, i, ip4_header_bytes (i)); + csum = ~ip_csum_fold (sum); + + i->checksum = save; + + /* Make checksum agree for special case where either + 0 or 0xffff would give same 1s complement sum. */ + if (csum == 0 && save == 0xffff) + csum = save; + + return csum; +} + +static inline uword +ip4_header_checksum_is_valid (ip4_header_t * i) +{ return i->checksum == ip4_header_checksum (i); } + +#define ip4_partial_header_checksum_x1(ip0,sum0) \ +do { \ + if (BITS (ip_csum_t) > 32) \ + { \ + sum0 = ip0->checksum_data_64[0]; \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64[1]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64_32[0]); \ + } \ + else \ + { \ + sum0 = ip0->checksum_data_32[0]; \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[1]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[2]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[3]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[4]); \ + } \ +} while (0) + +#define ip4_partial_header_checksum_x2(ip0,ip1,sum0,sum1) \ +do { \ + if (BITS (ip_csum_t) > 32) \ + { \ + sum0 = ip0->checksum_data_64[0]; \ + sum1 = ip1->checksum_data_64[0]; \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64[1]); \ + sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_64[1]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_64_32[0]); \ + sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_64_32[0]); \ + } \ + else \ + { \ + sum0 = 
ip0->checksum_data_32[0]; \ + sum1 = ip1->checksum_data_32[0]; \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[1]); \ + sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[1]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[2]); \ + sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[2]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[3]); \ + sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[3]); \ + sum0 = ip_csum_with_carry (sum0, ip0->checksum_data_32[4]); \ + sum1 = ip_csum_with_carry (sum1, ip1->checksum_data_32[4]); \ + } \ +} while (0) + +always_inline uword +ip4_address_is_multicast (ip4_address_t * a) +{ return (a->data[0] & 0xf0) == 0xe0; } + +always_inline void +ip4_multicast_address_set_for_group (ip4_address_t * a, ip_multicast_group_t g) +{ + ASSERT (g < (1 << 28)); + a->as_u32 = clib_host_to_net_u32 ((0xe << 28) + g); +} + +always_inline void +ip4_tcp_reply_x1 (ip4_header_t * ip0, tcp_header_t * tcp0) +{ + u32 src0, dst0; + + src0 = ip0->src_address.data_u32; + dst0 = ip0->dst_address.data_u32; + ip0->src_address.data_u32 = dst0; + ip0->dst_address.data_u32 = src0; + + src0 = tcp0->ports.src; + dst0 = tcp0->ports.dst; + tcp0->ports.src = dst0; + tcp0->ports.dst = src0; +} + +always_inline void +ip4_tcp_reply_x2 (ip4_header_t * ip0, ip4_header_t * ip1, + tcp_header_t * tcp0, tcp_header_t * tcp1) +{ + u32 src0, dst0, src1, dst1; + + src0 = ip0->src_address.data_u32; + src1 = ip1->src_address.data_u32; + dst0 = ip0->dst_address.data_u32; + dst1 = ip1->dst_address.data_u32; + ip0->src_address.data_u32 = dst0; + ip1->src_address.data_u32 = dst1; + ip0->dst_address.data_u32 = src0; + ip1->dst_address.data_u32 = src1; + + src0 = tcp0->ports.src; + src1 = tcp1->ports.src; + dst0 = tcp0->ports.dst; + dst1 = tcp1->ports.dst; + tcp0->ports.src = dst0; + tcp1->ports.src = dst1; + tcp0->ports.dst = src0; + tcp1->ports.dst = src1; +} + +#endif /* included_ip4_packet_h */ diff --git a/vnet/vnet/ip/ip4_pg.c 
b/vnet/vnet/ip/ip4_pg.c new file mode 100644 index 00000000000..9710d8d4c5a --- /dev/null +++ b/vnet/vnet/ip/ip4_pg.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4_pg: IP v4 packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/ip/ip.h> +#include <vnet/pg/pg.h> + +#define IP4_PG_EDIT_CHECKSUM (1 << 0) +#define IP4_PG_EDIT_LENGTH (1 << 1) + +static_always_inline void +compute_length_and_or_checksum (vlib_main_t * vm, + u32 * packets, + u32 n_packets, + u32 ip_header_offset, + u32 flags) +{ + ASSERT (flags != 0); + + while (n_packets >= 2) + { + u32 pi0, pi1; + vlib_buffer_t * p0, * p1; + ip4_header_t * ip0, * ip1; + ip_csum_t sum0, sum1; + + pi0 = packets[0]; + pi1 = packets[1]; + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + n_packets -= 2; + packets += 2; + + ip0 = (void *) (p0->data + ip_header_offset); + ip1 = (void *) (p1->data + ip_header_offset); + + if (flags & IP4_PG_EDIT_LENGTH) + { + ip0->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset); + ip1->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p1) - ip_header_offset); + } + + if (flags & IP4_PG_EDIT_CHECKSUM) + { + ASSERT (ip4_header_bytes (ip0) == sizeof (ip0[0])); + ASSERT (ip4_header_bytes (ip1) == sizeof (ip1[0])); + + ip0->checksum = 0; + ip1->checksum = 0; + + ip4_partial_header_checksum_x2 (ip0, ip1, sum0, sum1); + ip0->checksum = ~ ip_csum_fold (sum0); + ip1->checksum = ~ ip_csum_fold (sum1); + + ASSERT (ip0->checksum == ip4_header_checksum (ip0)); + ASSERT (ip1->checksum == ip4_header_checksum (ip1)); + } + } + + while (n_packets >= 1) + { + u32 pi0; + vlib_buffer_t * p0; + ip4_header_t * ip0; + ip_csum_t sum0; + + pi0 = packets[0]; + p0 = vlib_get_buffer (vm, pi0); + n_packets -= 1; + packets += 1; + + ip0 = (void *) (p0->data + ip_header_offset); + + if (flags & IP4_PG_EDIT_LENGTH) + ip0->length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset); + + if (flags & IP4_PG_EDIT_CHECKSUM) + { + ASSERT (ip4_header_bytes (ip0) == sizeof (ip0[0])); + + ip0->checksum = 0; + + ip4_partial_header_checksum_x1 (ip0, sum0); + ip0->checksum = ~ ip_csum_fold (sum0); + + ASSERT (ip0->checksum == 
ip4_header_checksum (ip0));
+        }
+    }
+}
+/*
+ * Packet-generator edit callback for an IPv4 edit group.
+ *
+ * Takes the IP header byte offset from g->start_byte_offset and calls
+ * compute_length_and_or_checksum() on the n_packets buffer indices in
+ * packets, with the flag combination stored in g->edit_function_opaque
+ * (IP4_PG_EDIT_LENGTH, IP4_PG_EDIT_CHECKSUM, or both).  Any other opaque
+ * value hits ASSERT (0).  The stream argument s is not used here.
+ *
+ * NOTE(review): each case passes the flags as a literal constant,
+ * presumably so the static_always_inline helper is specialized per
+ * combination - confirm before collapsing the switch.
+ */
+static void
+ip4_pg_edit_function (pg_main_t * pg,
+                      pg_stream_t * s,
+                      pg_edit_group_t * g,
+                      u32 * packets,
+                      u32 n_packets)
+{
+  vlib_main_t * vm = pg->vlib_main;
+  u32 ip_offset;
+
+  ip_offset = g->start_byte_offset;
+
+  switch (g->edit_function_opaque)
+    {
+    case IP4_PG_EDIT_LENGTH:
+      compute_length_and_or_checksum (vm, packets, n_packets, ip_offset,
+                                      IP4_PG_EDIT_LENGTH);
+      break;
+
+    case IP4_PG_EDIT_CHECKSUM:
+      compute_length_and_or_checksum (vm, packets, n_packets, ip_offset,
+                                      IP4_PG_EDIT_CHECKSUM);
+      break;
+
+    case IP4_PG_EDIT_LENGTH | IP4_PG_EDIT_CHECKSUM:
+      compute_length_and_or_checksum (vm, packets, n_packets, ip_offset,
+                                      IP4_PG_EDIT_LENGTH
+                                      | IP4_PG_EDIT_CHECKSUM);
+      break;
+
+    default:
+      ASSERT (0); /* opaque value with no known flag combination */
+      break;
+    }
+}
+/*
+ * One pg_edit_t per editable IPv4 header field.  pg_ip4_header_init()
+ * binds each member to the matching ip4_header_t field or bit field.
+ */
+typedef struct {
+  pg_edit_t ip_version, header_length;
+  pg_edit_t tos;
+  pg_edit_t length;
+
+  pg_edit_t fragment_id, fragment_offset;
+
+  /* Flags together with fragment offset. */
+  pg_edit_t mf_flag, df_flag, ce_flag;
+
+  pg_edit_t ttl;
+
+  pg_edit_t protocol;
+
+  pg_edit_t checksum;
+
+  pg_edit_t src_address, dst_address;
+} pg_ip4_header_t;
+
+static inline void
+pg_ip4_header_init (pg_ip4_header_t * p)
+{
+  /* Initialize fields that are not bit fields in the IP header. */
+#define _(f) pg_edit_init (&p->f, ip4_header_t, f);
+  _ (tos);
+  _ (length);
+  _ (fragment_id);
+  _ (ttl);
+  _ (protocol);
+  _ (checksum);
+  _ (src_address);
+  _ (dst_address);
+#undef _
+
+  /* Initialize bit fields.
*/ + pg_edit_init_bitfield (&p->header_length, ip4_header_t, + ip_version_and_header_length, + 0, 4); + pg_edit_init_bitfield (&p->ip_version, ip4_header_t, + ip_version_and_header_length, + 4, 4); + + pg_edit_init_bitfield (&p->fragment_offset, ip4_header_t, + flags_and_fragment_offset, + 0, 13); + pg_edit_init_bitfield (&p->mf_flag, ip4_header_t, + flags_and_fragment_offset, + 13, 1); + pg_edit_init_bitfield (&p->df_flag, ip4_header_t, + flags_and_fragment_offset, + 14, 1); + pg_edit_init_bitfield (&p->ce_flag, ip4_header_t, + flags_and_fragment_offset, + 15, 1); +} + +uword +unformat_pg_ip4_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_ip4_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (ip4_header_t), + &group_index); + pg_ip4_header_init (p); + + /* Defaults. */ + pg_edit_set_fixed (&p->ip_version, 4); + pg_edit_set_fixed (&p->header_length, + sizeof (ip4_header_t) / sizeof (u32)); + + pg_edit_set_fixed (&p->tos, 0); + pg_edit_set_fixed (&p->ttl, 64); + + pg_edit_set_fixed (&p->fragment_id, 0); + pg_edit_set_fixed (&p->fragment_offset, 0); + pg_edit_set_fixed (&p->mf_flag, 0); + pg_edit_set_fixed (&p->df_flag, 0); + pg_edit_set_fixed (&p->ce_flag, 0); + + p->length.type = PG_EDIT_UNSPECIFIED; + p->checksum.type = PG_EDIT_UNSPECIFIED; + + if (unformat (input, "%U: %U -> %U", + unformat_pg_edit, + unformat_ip_protocol, &p->protocol, + unformat_pg_edit, + unformat_ip4_address, &p->src_address, + unformat_pg_edit, + unformat_ip4_address, &p->dst_address)) + goto found; + + if (! unformat (input, "%U:", + unformat_pg_edit, + unformat_ip_protocol, &p->protocol)) + goto error; + +found: + /* Parse options. 
*/ + while (1) + { + if (unformat (input, "version %U", + unformat_pg_edit, + unformat_pg_number, &p->ip_version)) + ; + + else if (unformat (input, "header-length %U", + unformat_pg_edit, + unformat_pg_number, &p->header_length)) + ; + + else if (unformat (input, "tos %U", + unformat_pg_edit, + unformat_pg_number, &p->tos)) + ; + + else if (unformat (input, "length %U", + unformat_pg_edit, + unformat_pg_number, &p->length)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, + unformat_pg_number, &p->checksum)) + ; + + else if (unformat (input, "ttl %U", + unformat_pg_edit, + unformat_pg_number, &p->ttl)) + ; + + else if (unformat (input, "fragment id %U offset %U", + unformat_pg_edit, + unformat_pg_number, &p->fragment_id, + unformat_pg_edit, + unformat_pg_number, &p->fragment_offset)) + { + int i; + for (i = 0; i< ARRAY_LEN (p->fragment_offset.values); i++) + pg_edit_set_value (&p->fragment_offset, i, + pg_edit_get_value (&p->fragment_offset, i) / 8); + + } + + /* Flags. */ + else if (unformat (input, "mf") || unformat (input, "MF")) + pg_edit_set_fixed (&p->mf_flag, 1); + + else if (unformat (input, "df") || unformat (input, "DF")) + pg_edit_set_fixed (&p->df_flag, 1); + + else if (unformat (input, "ce") || unformat (input, "CE")) + pg_edit_set_fixed (&p->ce_flag, 1); + + /* Can't parse input: try next protocol level. */ + else + break; + } + + { + ip_main_t * im = &ip_main; + ip_protocol_t protocol; + ip_protocol_info_t * pi; + + pi = 0; + if (p->protocol.type == PG_EDIT_FIXED) + { + protocol = pg_edit_get_value (&p->protocol, PG_EDIT_LO); + pi = ip_get_protocol_info (im, protocol); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (! 
unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->length.type == PG_EDIT_UNSPECIFIED + && s->min_packet_bytes == s->max_packet_bytes + && group_index + 1 < vec_len (s->edit_groups)) + { + pg_edit_set_fixed (&p->length, + pg_edit_group_n_bytes (s, group_index)); + } + + /* Compute IP header checksum if all edits are fixed. */ + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + { + ip4_header_t fixed_header, fixed_mask, cmp_mask; + + /* See if header is all fixed and specified except for + checksum field. */ + memset (&cmp_mask, ~0, sizeof (cmp_mask)); + cmp_mask.checksum = 0; + + pg_edit_group_get_fixed_packet_data (s, group_index, + &fixed_header, &fixed_mask); + if (! memcmp (&fixed_mask, &cmp_mask, sizeof (cmp_mask))) + pg_edit_set_fixed (&p->checksum, + clib_net_to_host_u16 (ip4_header_checksum (&fixed_header))); + } + + p = pg_get_edit_group (s, group_index); + if (p->length.type == PG_EDIT_UNSPECIFIED + || p->checksum.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = ip4_pg_edit_function; + g->edit_function_opaque = 0; + if (p->length.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= IP4_PG_EDIT_LENGTH; + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= IP4_PG_EDIT_CHECKSUM; + } + + return 1; + } + + error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + diff --git a/vnet/vnet/ip/ip4_source_check.c b/vnet/vnet/ip/ip4_source_check.c new file mode 100644 index 00000000000..47e22f2392e --- /dev/null +++ b/vnet/vnet/ip/ip4_source_check.c @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip4_source_check.c: IP v4 check source address (unicast RPF check) + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/ip/ip.h> + +typedef struct { + u8 packet_data[64]; +} ip4_source_check_trace_t; + +static u8 * format_ip4_source_check_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + ip4_source_check_trace_t * t = va_arg (*va, ip4_source_check_trace_t *); + + s = format (s, "%U", + format_ip4_header, + t->packet_data, sizeof (t->packet_data)); + + return s; +} + +typedef enum { + IP4_SOURCE_CHECK_NEXT_DROP, + IP4_SOURCE_CHECK_N_NEXT, +} ip4_source_check_next_t; + +typedef enum { + IP4_SOURCE_CHECK_REACHABLE_VIA_RX, + IP4_SOURCE_CHECK_REACHABLE_VIA_ANY, +} ip4_source_check_type_t; + +typedef union { + struct { + u32 no_default_route : 1; + u32 fib_index : 31; + }; + u32 as_u32[1]; +} ip4_source_check_config_t; + +always_inline uword +ip4_source_check_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + ip4_source_check_type_t source_check_type) +{ + ip4_main_t * im = &ip4_main; + ip_lookup_main_t * lm = &im->lookup_main; + ip_config_main_t * cm = &lm->rx_config_mains[VNET_UNICAST]; + u32 n_left_from, * from, * to_next; + u32 next_index; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip4_input_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (ip4_source_check_trace_t)); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t * p0, * p1; + ip4_header_t * ip0, * ip1; + ip4_fib_mtrie_t * mtrie0, * mtrie1; + ip4_fib_mtrie_leaf_t leaf0, leaf1; + ip4_source_check_config_t * c0, * c1; + ip_adjacency_t * adj0, * adj1; + u32 pi0, next0, pass0, 
adj_index0; + u32 pi1, next1, pass1, adj_index1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD); + CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + c0 = vnet_get_config_data (&cm->config_main, + &vnet_buffer (p0)->ip.current_config_index, + &next0, + sizeof (c0[0])); + c1 = vnet_get_config_data (&cm->config_main, + &vnet_buffer (p1)->ip.current_config_index, + &next1, + sizeof (c1[0])); + + mtrie0 = &vec_elt_at_index (im->fibs, c0->fib_index)->mtrie; + mtrie1 = &vec_elt_at_index (im->fibs, c1->fib_index)->mtrie; + + leaf0 = leaf1 = IP4_FIB_MTRIE_LEAF_ROOT; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 0); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 1); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 1); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 2); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3); + leaf1 = ip4_fib_mtrie_lookup_step (mtrie1, leaf1, &ip1->src_address, 3); + + adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + adj_index1 = ip4_fib_mtrie_leaf_get_adj_index (leaf1); + + ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, c0->fib_index, + &ip0->src_address, + c0->no_default_route)); + ASSERT (adj_index1 == ip4_fib_lookup_with_table (im, 
c1->fib_index, + &ip1->src_address, + c1->no_default_route)); + + adj0 = ip_get_adjacency (lm, adj_index0); + adj1 = ip_get_adjacency (lm, adj_index1); + + /* Pass multicast. */ + pass0 = ip4_address_is_multicast (&ip0->src_address) || ip0->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF); + pass1 = ip4_address_is_multicast (&ip1->src_address) || ip1->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF); + + pass0 |= (adj0->lookup_next_index == IP_LOOKUP_NEXT_REWRITE + && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY + || vnet_buffer (p0)->sw_if_index[VLIB_RX] == adj0->rewrite_header.sw_if_index)); + pass1 |= (adj1->lookup_next_index == IP_LOOKUP_NEXT_REWRITE + && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY + || vnet_buffer (p1)->sw_if_index[VLIB_RX] == adj1->rewrite_header.sw_if_index)); + + next0 = (pass0 ? next0 : IP4_SOURCE_CHECK_NEXT_DROP); + next1 = (pass1 ? next1 : IP4_SOURCE_CHECK_NEXT_DROP); + + p0->error = error_node->errors[IP4_ERROR_UNICAST_SOURCE_CHECK_FAILS]; + p1->error = error_node->errors[IP4_ERROR_UNICAST_SOURCE_CHECK_FAILS]; + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + ip4_fib_mtrie_t * mtrie0; + ip4_fib_mtrie_leaf_t leaf0; + ip4_source_check_config_t * c0; + ip_adjacency_t * adj0; + u32 pi0, next0, pass0, adj_index0; + + pi0 = from[0]; + to_next[0] = pi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + ip0 = vlib_buffer_get_current (p0); + + c0 = vnet_get_config_data (&cm->config_main, + &vnet_buffer (p0)->ip.current_config_index, + &next0, + sizeof (c0[0])); + + mtrie0 = &vec_elt_at_index (im->fibs, c0->fib_index)->mtrie; + + leaf0 = IP4_FIB_MTRIE_LEAF_ROOT; + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 0); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, 
leaf0, &ip0->src_address, 1); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 2); + + leaf0 = ip4_fib_mtrie_lookup_step (mtrie0, leaf0, &ip0->src_address, 3); + + adj_index0 = ip4_fib_mtrie_leaf_get_adj_index (leaf0); + + ASSERT (adj_index0 == ip4_fib_lookup_with_table (im, c0->fib_index, + &ip0->src_address, + c0->no_default_route)); + adj0 = ip_get_adjacency (lm, adj_index0); + + /* Pass multicast. */ + pass0 = ip4_address_is_multicast (&ip0->src_address) || ip0->src_address.as_u32 == clib_host_to_net_u32(0xFFFFFFFF); + + pass0 |= (adj0->lookup_next_index == IP_LOOKUP_NEXT_REWRITE + && (source_check_type == IP4_SOURCE_CHECK_REACHABLE_VIA_ANY + || vnet_buffer (p0)->sw_if_index[VLIB_RX] == adj0->rewrite_header.sw_if_index)); + + next0 = (pass0 ? next0 : IP4_SOURCE_CHECK_NEXT_DROP); + p0->error = error_node->errors[IP4_ERROR_UNICAST_SOURCE_CHECK_FAILS]; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static uword +ip4_source_check_reachable_via_any (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip4_source_check_inline (vm, node, frame, IP4_SOURCE_CHECK_REACHABLE_VIA_ANY); +} + +static uword +ip4_source_check_reachable_via_rx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip4_source_check_inline (vm, node, frame, IP4_SOURCE_CHECK_REACHABLE_VIA_RX); +} + +VLIB_REGISTER_NODE (ip4_check_source_reachable_via_any) = { + .function = ip4_source_check_reachable_via_any, + .name = "ip4-source-check-via-any", + .vector_size = sizeof (u32), + + .n_next_nodes = IP4_SOURCE_CHECK_N_NEXT, + .next_nodes = { + [IP4_SOURCE_CHECK_NEXT_DROP] = "error-drop", + }, + + .format_buffer = format_ip4_header, + .format_trace = format_ip4_source_check_trace, +}; + +VLIB_REGISTER_NODE (ip4_check_source_reachable_via_rx) = { + .function = 
ip4_source_check_reachable_via_rx,
+  .name = "ip4-source-check-via-rx",
+  .vector_size = sizeof (u32),
+
+  .n_next_nodes = IP4_SOURCE_CHECK_N_NEXT,
+  .next_nodes = {
+    [IP4_SOURCE_CHECK_NEXT_DROP] = "error-drop",
+  },
+
+  .format_buffer = format_ip4_header,
+  .format_trace = format_ip4_source_check_trace,
+};
+/*
+ * CLI handler for "set interface ip source-check".
+ *
+ * Parses a software interface from input, then adds the source-check RX
+ * feature to that interface's unicast RX config, or removes it when the
+ * keyword "del" follows.  The config passed along carries the interface's
+ * FIB index and no_default_route = 0.  The updated config index is stored
+ * back in rx_cm->config_index_by_sw_if_index.
+ *
+ * Returns 0 on success, or an "unknown interface" error when the
+ * interface cannot be parsed.
+ *
+ * NOTE(review): the feature type is always
+ * IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_RX; nothing in this handler
+ * selects a via-any variant.  The short_help text also mentions IP6
+ * although only ip4_main is touched here.
+ */
+static clib_error_t *
+set_ip_source_check (vlib_main_t * vm,
+                     unformat_input_t * input,
+                     vlib_cli_command_t * cmd)
+{
+  vnet_main_t * vnm = vnet_get_main();
+  ip4_main_t * im = &ip4_main;
+  ip_lookup_main_t * lm = &im->lookup_main;
+  ip_config_main_t * rx_cm = &lm->rx_config_mains[VNET_UNICAST];
+  clib_error_t * error = 0;
+  u32 sw_if_index, is_del, ci;
+  ip4_source_check_config_t config;
+  ip4_rx_feature_type_t type;
+
+  sw_if_index = ~0;
+
+  if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index))
+    {
+      error = clib_error_return (0, "unknown interface `%U'",
+                                 format_unformat_error, input);
+      goto done;
+    }
+
+  is_del = 0; /* default action is to add the feature */
+  config.no_default_route = 0;
+  config.fib_index = im->fib_index_by_sw_if_index[sw_if_index];
+  type = IP4_RX_FEATURE_SOURCE_CHECK_REACHABLE_VIA_RX;
+  if (unformat (input, "del"))
+    is_del = 1;
+
+  ci = rx_cm->config_index_by_sw_if_index[sw_if_index];
+  ci = (is_del
+        ? vnet_config_del_feature
+        : vnet_config_add_feature) /* pick the function, then call it */
+    (vm, &rx_cm->config_main,
+     ci,
+     type,
+     &config,
+     sizeof (config));
+  rx_cm->config_index_by_sw_if_index[sw_if_index] = ci;
+
+ done:
+  return error;
+}
+
+VLIB_CLI_COMMAND (set_interface_ip_source_check_command, static) = {
+  .path = "set interface ip source-check",
+  .function = set_ip_source_check,
+  .short_help = "Set IP4/IP6 interface unicast source check",
+};
+
+/* Dummy init function to get us linked in.
*/ +clib_error_t * ip4_source_check_init (vlib_main_t * vm) +{ return 0; } + +VLIB_INIT_FUNCTION (ip4_source_check_init); diff --git a/vnet/vnet/ip/ip4_test.c b/vnet/vnet/ip/ip4_test.c new file mode 100644 index 00000000000..ff088e78f3e --- /dev/null +++ b/vnet/vnet/ip/ip4_test.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> + +/* + * ip4 FIB tester. Add, probe, delete a bunch of + * random routes / masks and make sure that the mtrie agrees with + * the hash-table FIB. + * + * Manipulate the FIB by means of the debug CLI commands, to minimize + * the chances of doing something idiotic. + */ + +/* + * These routines need to be redeclared non-static elsewhere. 
+ * + * Also: rename ip_route() -> vnet_ip_route_cmd() and add the usual + * test_route_init() call to main.c + */ +clib_error_t * +vnet_ip_route_cmd (vlib_main_t * vm, + unformat_input_t * input, vlib_cli_command_t * cmd_arg); + +int ip4_lookup_validate (ip4_address_t *a, u32 fib_index0); + +ip4_fib_t * +find_fib_by_table_index_or_id (ip4_main_t * im, u32 table_index_or_id, + u32 flags); + +/* Routes to insert/delete/probe in FIB */ +typedef struct { + ip4_address_t address; + u32 mask_width; + u32 interface_id; /* not an xx_if_index */ +} test_route_t; + +typedef struct { + /* Test routes in use */ + test_route_t *route_pool; + + /* Number of fake ethernets created */ + u32 test_interfaces_created; +} test_main_t; + +test_main_t test_main; + +/* fake ethernet device class, distinct from "fake-ethX" */ +static u8 * format_test_interface_name (u8 * s, va_list * args) +{ + u32 dev_instance = va_arg (*args, u32); + return format (s, "test-eth%d", dev_instance); +} + +static uword dummy_interface_tx (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + clib_warning ("you shouldn't be here, leaking buffers..."); + return frame->n_vectors; +} + +VNET_DEVICE_CLASS (test_interface_device_class,static) = { + .name = "Test interface", + .format_device_name = format_test_interface_name, + .tx_function = dummy_interface_tx, +}; + +static clib_error_t * +thrash (vlib_main_t * vm, + unformat_input_t * main_input, vlib_cli_command_t * cmd_arg) +{ + u32 seed = 0xdeaddabe; + u32 niter = 10; + u32 nroutes = 10; + u32 ninterfaces = 4; + f64 min_mask_bits = 7.0; + f64 max_mask_bits = 32.0; + u32 table_id = 11; /* my amp goes to 11 (use fib 11) */ + u32 table_index; + int iter, i; + u8 * cmd; + test_route_t *tr; + test_main_t *tm = &test_main; + ip4_main_t * im = &ip4_main; + vnet_main_t * vnm = vnet_get_main(); + unformat_input_t cmd_input; + f64 rf; + u32 *masks = 0; + u32 tmp; + u32 hw_if_index; + clib_error_t * error = 0; + uword *p; + unformat_input_t 
_line_input, * line_input = &_line_input; + u8 hw_address[6]; + ip4_fib_t * fib; + int verbose = 0; + + /* Precompute mask width -> mask vector */ + tmp = (u32)~0; + vec_validate (masks, 32); + for (i = 32; i > 0; i--) + { + masks [i] = tmp; + tmp <<= 1; + } + + if (unformat_user (main_input, unformat_line_input, line_input)) + { + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "seed %d", &seed)) + ; + else if (unformat (line_input, "niter %d", &niter)) + ; + else if (unformat (line_input, "nroutes %d", &nroutes)) + ; + else if (unformat (line_input, "ninterfaces %d", &ninterfaces)) + ; + else if (unformat (line_input, "min-mask-bits %d", &tmp)) + min_mask_bits = (f64) tmp; + else if (unformat (line_input, "max-mask-bits %d", &tmp)) + max_mask_bits = (f64) tmp; + else if (unformat (line_input, "verbose")) + verbose = 1; + else + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, line_input); + } + } + + /* Find or create FIB table 11 */ + fib = find_ip4_fib_by_table_index_or_id (im, table_id, IP4_ROUTE_FLAG_TABLE_ID); + + for (i = tm->test_interfaces_created; i < ninterfaces; i++) + { + vnet_hw_interface_t * hw; + memset (hw_address, 0, sizeof (hw_address)); + hw_address[0] = 0xd0; + hw_address[1] = 0x0f; + hw_address[5] = i; + + error = ethernet_register_interface + (vnm, + test_interface_device_class.index, + i /* instance */, + hw_address, + &hw_if_index, + /* flag change */ 0); + + /* Fake interfaces use FIB table 11 */ + hw = vnet_get_hw_interface (vnm, hw_if_index); + vec_validate (im->fib_index_by_sw_if_index, hw->sw_if_index); + im->fib_index_by_sw_if_index[hw->sw_if_index] = fib->index; + } + + tm->test_interfaces_created = ninterfaces; + + /* Find fib index corresponding to FIB id 11 */ + p = hash_get (im->fib_index_by_table_id, table_id); + if (p == 0) + { + vlib_cli_output (vm, "Couldn't map fib id %d to fib index\n", + table_id); + return 0; + } + table_index = p[0]; + + for 
(iter = 0; iter < niter; iter++) + { + /* Pick random routes to install */ + for (i = 0; i < nroutes; i++) + { + int j; + + pool_get (tm->route_pool, tr); + memset (tr, 0, sizeof (*tr)); + + again: + rf = random_f64 (&seed); + tr->mask_width = (u32) (min_mask_bits + + rf * (max_mask_bits - min_mask_bits)); + tmp = random_u32 (&seed); + tmp &= masks[tr->mask_width]; + tr->address.as_u32 = clib_host_to_net_u32(tmp); + + /* We can't add the same address/mask twice... */ + for (j = 0; j < i; j++) + { + test_route_t *prev; + prev = pool_elt_at_index (tm->route_pool, j); + if ((prev->address.as_u32 == tr->address.as_u32) + && (prev->mask_width == tr->mask_width)) + goto again; + } + + rf = random_f64 (&seed); + tr->interface_id = (u32) (rf * ninterfaces); + } + + /* Add them */ + for (i = 0; i < nroutes; i++) + { + tr = pool_elt_at_index (tm->route_pool, i); + cmd = format (0, "add table %d %U/%d via test-eth%d", + table_id, + format_ip4_address, &tr->address, + tr->mask_width, tr->interface_id); + vec_add1(cmd,0); + if (verbose) + fformat(stderr, "ip route %s\n", cmd); + unformat_init_string (&cmd_input, (char *) cmd, vec_len(cmd)-1); + error = vnet_ip_route_cmd (vm, &cmd_input, cmd_arg); + if (error) + clib_error_report(error); + unformat_free (&cmd_input); + vec_free(cmd); + } + /* Probe them */ + for (i = 0; i < nroutes; i++) + { + tr = pool_elt_at_index (tm->route_pool, i); + if (!ip4_lookup_validate (&tr->address, table_index)) + { + if (verbose) + fformat (stderr, "test lookup table %d %U\n", + table_index, format_ip4_address, &tr->address); + + fformat (stderr, "FAIL-after-insert: %U/%d\n", + format_ip4_address, &tr->address, + tr->mask_width); + } + } + + /* Delete them */ + for (i = 0; i < nroutes; i++) + { + int j; + tr = pool_elt_at_index (tm->route_pool, i); + if (0) + cmd = format (0, "del table %d %U/%d via test-eth%d", + table_id, + format_ip4_address, &tr->address, + tr->mask_width, tr->interface_id); + else + cmd = format (0, "del table %d %U/%d", + 
table_id, + format_ip4_address, &tr->address, + tr->mask_width); + vec_add1(cmd,0); + if (verbose) + fformat(stderr, "ip route %s\n", cmd); + unformat_init_string (&cmd_input, (char *) cmd, vec_len(cmd)-1); + error = vnet_ip_route_cmd (vm, &cmd_input, cmd_arg); + if (error) + clib_error_report(error); + unformat_free (&cmd_input); + vec_free(cmd); + + /* Make sure all undeleted routes still work */ + for (j = i+1; j < nroutes; j++) + { + test_route_t *rr; /* remaining route */ + rr = pool_elt_at_index (tm->route_pool, j); + if (!ip4_lookup_validate (&rr->address, table_index)) + { + if (verbose) + fformat (stderr, "test lookup table %d %U\n", + table_index, format_ip4_address, &rr->address); + + fformat (stderr, "FAIL: %U/%d AWOL\n", + format_ip4_address, &rr->address, + rr->mask_width); + fformat (stderr, " iter %d after %d of %d deletes\n", + iter, i, nroutes); + fformat (stderr, " last route deleted %U/%d\n", + format_ip4_address, &tr->address, + tr->mask_width); + } + } + } + + pool_free (tm->route_pool); + } + return 0; +} + +VLIB_CLI_COMMAND (test_route_command, static) = { + .path = "test route", + .short_help = "test route", + .function = thrash, +}; + +clib_error_t *test_route_init (vlib_main_t *vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (test_route_init); diff --git a/vnet/vnet/ip/ip6.h b/vnet/vnet/ip/ip6.h new file mode 100644 index 00000000000..a5c322a2fa5 --- /dev/null +++ b/vnet/vnet/ip/ip6.h @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip6.h: ip6 main include file + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_ip_ip6_h +#define included_ip_ip6_h + +#include <vlib/mc.h> +#include <vnet/ip/ip6_packet.h> +#include <vnet/ip/lookup.h> + +#include <vppinfra/bihash_24_8.h> +#include <vppinfra/bihash_template.h> + +/* + * Default size of the ip6 fib hash table + */ +#define IP6_FIB_DEFAULT_HASH_NUM_BUCKETS (64 * 1024) +#define IP6_FIB_DEFAULT_HASH_MEMORY_SIZE (32<<20) + +typedef struct { + ip6_address_t addr; + u32 dst_address_length; + u32 vrf_index; +} ip6_fib_key_t; + +typedef struct { + /* Table ID (hash key) for this FIB. */ + u32 table_id; + + /* Index into FIB vector. 
*/ + u32 index; + + /* flow hash configuration */ + u32 flow_hash_config; +} ip6_fib_t; + +struct ip6_main_t; + +typedef void (ip6_add_del_route_function_t) + (struct ip6_main_t * im, + uword opaque, + ip6_fib_t * fib, + u32 flags, + ip6_address_t * address, + u32 address_length, + void * old_result, + void * new_result); + +typedef struct { + ip6_add_del_route_function_t * function; + uword required_flags; + uword function_opaque; +} ip6_add_del_route_callback_t; + +typedef void (ip6_add_del_interface_address_function_t) + (struct ip6_main_t * im, + uword opaque, + u32 sw_if_index, + ip6_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_del); + +typedef struct { + ip6_add_del_interface_address_function_t * function; + uword function_opaque; +} ip6_add_del_interface_address_callback_t; + +typedef enum { + /* First check access list to either permit or deny this + packet based on classification. */ + IP6_RX_FEATURE_CHECK_ACCESS, + + /* RPF check: verify that source address is reachable via + RX interface or via any interface. */ + IP6_RX_FEATURE_CHECK_SOURCE_REACHABLE_VIA_RX, + IP6_RX_FEATURE_CHECK_SOURCE_REACHABLE_VIA_ANY, + + /* IPSec */ + IP6_RX_FEATURE_IPSEC, + + /* Intercept and decap L2TPv3 packets. */ + IP6_RX_FEATURE_L2TPV3, + + /* vPath forwarding: won't return to call next feature + so any feature needed before vPath forwarding must be prior + to this entry */ + IP6_RX_FEATURE_VPATH, + + /* Must be last: perform forwarding lookup. */ + IP6_RX_FEATURE_LOOKUP, + + IP6_N_RX_FEATURE, +} ip6_rx_feature_type_t; + +typedef struct ip6_main_t { + BVT(clib_bihash) ip6_lookup_table; + + ip_lookup_main_t lookup_main; + + /* bitmap / refcounts / vector of mask widths to search */ + uword * non_empty_dst_address_length_bitmap; + u8 * prefix_lengths_in_search_order; + i32 dst_address_length_refcounts[129]; + + /* Vector of FIBs. */ + ip6_fib_t * fibs; + + ip6_address_t fib_masks[129]; + + /* Table index indexed by software interface. 
*/ + u32 * fib_index_by_sw_if_index; + + /* Hash table mapping table id to fib index. + ID space is not necessarily dense; index space is dense. */ + uword * fib_index_by_table_id; + + /* Vector of functions to call when routes are added/deleted. */ + ip6_add_del_route_callback_t * add_del_route_callbacks; + + /* Hash table mapping interface rewrite adjacency index by sw if index. */ + uword * interface_route_adj_index_by_sw_if_index; + + /* Functions to call when interface address changes. */ + ip6_add_del_interface_address_callback_t * add_del_interface_address_callbacks; + + /* Template used to generate IP6 neighbor solicitation packets. */ + vlib_packet_template_t discover_neighbor_packet_template; + + u32 * discover_neighbor_next_index_by_hw_if_index; + + /* ip6 lookup table config parameters */ + u32 lookup_table_nbuckets; + uword lookup_table_size; + + /* Seed for Jenkins hash used to compute ip6 flow hash. */ + u32 flow_hash_seed; + + struct { + /* TTL to use for host generated packets. */ + u8 ttl; + + u8 pad[3]; + } host_config; +} ip6_main_t; + +/* Global ip6 main structure. */ +extern ip6_main_t ip6_main; + +/* Global ip6 input node. Errors get attached to ip6 input node. 
*/ +extern vlib_node_registration_t ip6_input_node; +extern vlib_node_registration_t ip6_rewrite_node; +extern vlib_node_registration_t ip6_discover_neighbor_node; + +extern vlib_node_registration_t ip6_icmp_neighbor_discovery_event_node; + +/* ipv6 neighbor discovery - timer/event types */ +typedef enum { + ICMP6_ND_EVENT_INIT, +} ip6_icmp_neighbor_discovery_event_type_t; + +typedef union { + u32 add_del_swindex; + struct { + u32 up_down_swindex; + u32 fib_index; + } up_down_event; +} ip6_icmp_neighbor_discovery_event_data_t; + +u32 ip6_fib_lookup (ip6_main_t * im, u32 sw_if_index, ip6_address_t * dst); +u32 ip6_fib_lookup_with_table (ip6_main_t * im, u32 fib_index, + ip6_address_t * dst); +ip6_fib_t * find_ip6_fib_by_table_index_or_id (ip6_main_t * im, + u32 table_index_or_id, + u32 flags); + +always_inline uword +ip6_destination_matches_route (ip6_main_t * im, + ip6_address_t * key, + ip6_address_t * dest, + uword dest_length) +{ + int i; + for (i = 0; i < ARRAY_LEN (key->as_uword); i++) + { + if ((key->as_uword[i] ^ dest->as_uword[i]) & im->fib_masks[dest_length].as_uword[i]) + return 0; + } + return 1; +} + +always_inline uword +ip6_destination_matches_interface (ip6_main_t * im, + ip6_address_t * key, + ip_interface_address_t * ia) +{ + ip6_address_t * a = ip_interface_address_get_address (&im->lookup_main, ia); + return ip6_destination_matches_route (im, key, a, ia->address_length); +} + +/* As above but allows for unaligned destinations (e.g. works right from IP header of packet). 
*/ +always_inline uword +ip6_unaligned_destination_matches_route (ip6_main_t * im, + ip6_address_t * key, + ip6_address_t * dest, + uword dest_length) +{ + int i; + for (i = 0; i < ARRAY_LEN (key->as_uword); i++) + { + if ((clib_mem_unaligned (&key->as_uword[i], uword) ^ dest->as_uword[i]) & im->fib_masks[dest_length].as_uword[i]) + return 0; + } + return 1; +} + +always_inline void +ip6_src_address_for_packet (ip6_main_t * im, vlib_buffer_t * p, ip6_address_t * src, u32 sw_if_index) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip_interface_address_t * ia = ip_interface_address_for_packet (lm, p, sw_if_index); + ip6_address_t * a = ip_interface_address_get_address (lm, ia); + *src = a[0]; +} + +always_inline u32 +ip6_src_lookup_for_packet (ip6_main_t * im, vlib_buffer_t * b, ip6_header_t * i) +{ + if (vnet_buffer (b)->ip.adj_index[VLIB_RX] == ~0) + vnet_buffer (b)->ip.adj_index[VLIB_RX] + = ip6_fib_lookup (im, vnet_buffer (b)->sw_if_index[VLIB_RX], + &i->src_address); + return vnet_buffer (b)->ip.adj_index[VLIB_RX]; +} + +/* Find interface address which matches destination. */ +always_inline ip6_address_t * +ip6_interface_address_matching_destination (ip6_main_t * im, ip6_address_t * dst, u32 sw_if_index, + ip_interface_address_t ** result_ia) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip_interface_address_t * ia; + ip6_address_t * result = 0; + + foreach_ip_interface_address (lm, ia, sw_if_index, + 1 /* honor unnumbered */, + ({ + ip6_address_t * a = ip_interface_address_get_address (lm, ia); + if (ip6_destination_matches_route (im, dst, a, ia->address_length)) + { + result = a; + break; + } + })); + if (result_ia) + *result_ia = result ? ia : 0; + return result; +} + +clib_error_t * +ip6_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index, + ip6_address_t * address, u32 address_length, + u32 is_del); + +int ip6_address_compare (ip6_address_t * a1, ip6_address_t * a2); + +/* Add/del a route to the FIB. 
*/ + +#define IP6_ROUTE_FLAG_ADD (0 << 0) +#define IP6_ROUTE_FLAG_DEL (1 << 0) +#define IP6_ROUTE_FLAG_TABLE_ID (0 << 1) +#define IP6_ROUTE_FLAG_FIB_INDEX (1 << 1) +#define IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY (1 << 2) +#define IP6_ROUTE_FLAG_NO_REDISTRIBUTE (1 << 3) +#define IP6_ROUTE_FLAG_NOT_LAST_IN_GROUP (1 << 4) +/* Dynamic route created via neighbor discovery. */ +#define IP6_ROUTE_FLAG_NEIGHBOR (1 << 5) + +typedef struct { + /* IP6_ROUTE_FLAG_* */ + u32 flags; + + /* Either index of fib or table_id to hash and get fib. + IP6_ROUTE_FLAG_FIB_INDEX specifies index; otherwise table_id is assumed. */ + u32 table_index_or_table_id; + + /* Destination address (prefix) and length. */ + ip6_address_t dst_address; + u32 dst_address_length; + + /* Adjacency to use for this destination. */ + u32 adj_index; + + /* If specified adjacencies to add and then + use for this destination. add_adj/n_add_adj + are override adj_index if specified. */ + ip_adjacency_t * add_adj; + u32 n_add_adj; +} ip6_add_del_route_args_t; + +void ip6_add_del_route (ip6_main_t * im, ip6_add_del_route_args_t * args); + +void ip6_add_del_route_next_hop (ip6_main_t * im, + u32 flags, + ip6_address_t * dst_address, + u32 dst_address_length, + ip6_address_t * next_hop, + u32 next_hop_sw_if_index, + u32 next_hop_weight, u32 adj_index, + u32 explicit_fib_index); +u32 +ip6_get_route (ip6_main_t * im, + u32 fib_index_or_table_id, + u32 flags, + ip6_address_t * address, + u32 address_length); + +void +ip6_foreach_matching_route (ip6_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip6_address_t * address, + u32 address_length, + ip6_address_t ** results, + u8 ** result_length); + +void ip6_delete_matching_routes (ip6_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip6_address_t * address, + u32 address_length); + +void ip6_maybe_remap_adjacencies (ip6_main_t * im, + u32 table_index_or_table_id, + u32 flags); + +void ip6_adjacency_set_interface_route (vnet_main_t * vnm, + ip_adjacency_t * 
adj, + u32 sw_if_index, + u32 if_address_index); + +clib_error_t * +ip6_probe_neighbor (vlib_main_t * vm, ip6_address_t * dst, u32 sw_if_index); + +clib_error_t * +ip6_set_neighbor_limit (u32 neighbor_limit); + +uword +ip6_tcp_register_listener (vlib_main_t * vm, + u16 dst_port, + u32 next_node_index); +uword +ip6_udp_register_listener (vlib_main_t * vm, + u16 dst_port, + u32 next_node_index); + +u16 ip6_tcp_udp_icmp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, ip6_header_t * ip0, int *bogus_lengthp); + +void ip6_register_protocol (u32 protocol, u32 node_index); + +serialize_function_t serialize_vnet_ip6_main, unserialize_vnet_ip6_main; + +int +vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t * a, + u8 * link_layer_address, + uword n_bytes_link_layer_address); +int +vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t * a, + u8 * link_layer_address, + uword n_bytes_link_layer_address); +void +vnet_ip6_fib_init (ip6_main_t * im, u32 fib_index); + +void +ip6_link_local_address_from_ethernet_mac_address (ip6_address_t *ip, + u8 *mac); + +void +ip6_ethernet_mac_address_from_link_local_address (u8 *mac, + ip6_address_t *ip); + +int vnet_set_ip6_flow_hash (u32 table_id, u32 flow_hash_config); + +int +ip6_neighbor_ra_config(vlib_main_t * vm, u32 sw_if_index, + u8 surpress, u8 managed, u8 other, + u8 ll_option, u8 send_unicast, u8 cease, + u8 use_lifetime, u32 lifetime, + u32 initial_count, u32 initial_interval, + u32 max_interval, u32 min_interval, + u8 is_no); + +int +ip6_neighbor_ra_prefix(vlib_main_t * vm, u32 sw_if_index, + ip6_address_t *prefix_addr, u8 prefix_len, + u8 use_default, u32 val_lifetime, u32 pref_lifetime, + u8 no_advertise, u8 off_link, u8 no_autoconfig, u8 no_onlink, + u8 is_no); + + +clib_error_t * +enable_ip6_interface(vlib_main_t * vm, + u32 sw_if_index); + +clib_error_t * +disable_ip6_interface(vlib_main_t * vm, + u32 sw_if_index); + +int 
+ip6_interface_enabled(vlib_main_t * vm, + u32 sw_if_index); + +clib_error_t * +set_ip6_link_local_address(vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t *address, + u8 address_length); + +void vnet_register_ip6_neighbor_resolution_event(vnet_main_t * vnm, + void * address_arg, + uword node_index, + uword type_opaque, + uword data); + +int vnet_set_ip6_classify_intfc (vlib_main_t * vm, u32 sw_if_index, + u32 table_index); +extern vlib_node_registration_t ip6_lookup_node; + +/* Compute flow hash. We'll use it to select which Sponge to use for this + flow. And other things. */ +always_inline u32 +ip6_compute_flow_hash (ip6_header_t * ip, u32 flow_hash_config) +{ + tcp_header_t * tcp = (void *) (ip + 1); + u64 a, b, c; + u64 t1, t2; + uword is_tcp_udp = (ip->protocol == IP_PROTOCOL_TCP + || ip->protocol == IP_PROTOCOL_UDP); + + t1 = (ip->src_address.as_u64[0] ^ ip->src_address.as_u64[1]); + t1 = (flow_hash_config & IP_FLOW_HASH_SRC_ADDR) ? t1 : 0; + + t2 = (ip->dst_address.as_u64[0] ^ ip->dst_address.as_u64[1]); + t2 = (flow_hash_config & IP_FLOW_HASH_DST_ADDR) ? t2 : 0; + + a = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t2 : t1; + b = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? t1 : t2; + b ^= (flow_hash_config & IP_FLOW_HASH_PROTO) ? ip->protocol : 0; + + t1 = is_tcp_udp ? tcp->ports.src : 0; + t2 = is_tcp_udp ? tcp->ports.dst : 0; + + t1 = (flow_hash_config & IP_FLOW_HASH_SRC_PORT) ? t1 : 0; + t2 = (flow_hash_config & IP_FLOW_HASH_DST_PORT) ? t2 : 0; + + c = (flow_hash_config & IP_FLOW_HASH_REVERSE_SRC_DST) ? + ((t1<<16) | t2) : ((t2<<16) | t1); + + hash_mix64 (a, b, c); + return (u32) c; +} + +#endif /* included_ip_ip6_h */ diff --git a/vnet/vnet/ip/ip6_error.h b/vnet/vnet/ip/ip6_error.h new file mode 100644 index 00000000000..93754a10fcc --- /dev/null +++ b/vnet/vnet/ip/ip6_error.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. 
/*
 * ip/ip6_error.h: ip6 fast path errors
 */

#ifndef included_ip_ip6_error_h
#define included_ip_ip6_error_h

/*
 * X-macro listing every ip6 fast path error as
 *   _ (SYMBOL, "counter description")
 * Define _ before expanding foreach_ip6_error and undefine it afterwards.
 * Entry order fixes the numeric values of ip6_error_t, so new entries
 * change the values of everything after them.
 */
#define foreach_ip6_error						\
  /* Must be first. */							\
  _ (NONE, "valid ip6 packets")						\
									\
  /* Errors signalled by ip6-input */					\
  _ (TOO_SHORT, "ip6 length < 40 bytes")				\
  _ (BAD_LENGTH, "ip6 length > l2 length")				\
  _ (VERSION, "ip6 version != 6")					\
  _ (TIME_EXPIRED, "ip6 ttl <= 1")					\
									\
  /* Errors signalled by ip6-rewrite. */				\
  _ (MTU_EXCEEDED, "ip6 MTU exceeded")					\
  _ (DST_LOOKUP_MISS, "ip6 destination lookup miss")			\
  _ (SRC_LOOKUP_MISS, "ip6 source lookup miss")				\
  _ (ADJACENCY_DROP, "ip6 adjacency drop")				\
  _ (ADJACENCY_PUNT, "ip6 adjacency punt")				\
									\
  /* Errors signalled by ip6-local. */					\
  _ (UNKNOWN_PROTOCOL, "unknown ip protocol")				\
  _ (UDP_CHECKSUM, "bad udp checksum")					\
  _ (TCP_CHECKSUM, "bad tcp checksum")					\
  _ (ICMP_CHECKSUM, "bad icmp checksum")				\
  _ (UDP_LENGTH, "inconsistent udp/ip lengths")				\
									\
  /* Errors signalled by {tcp6,udp6}-lookup. */				\
  _ (UNKNOWN_UDP_PORT, "no listener for udp port")			\
  _ (UNKNOWN_TCP_PORT, "no listener for tcp port")			\
									\
  /* Spoofed packets in ip6-rewrite-local */				\
  _ (SPOOFED_LOCAL_PACKETS, "ip6 spoofed local-address packet drops")	\
									\
  /* Errors signalled by ip6-inacl */					\
  _ (INACL_TABLE_MISS, "input ACL table-miss drops")			\
  _ (INACL_SESSION_DENY, "input ACL session deny drops")

/* One IP6_ERROR_<SYMBOL> per entry above; IP6_N_ERROR is the count. */
typedef enum {
#define _(sym,str) IP6_ERROR_##sym,
  foreach_ip6_error
#undef _
  IP6_N_ERROR,
} ip6_error_t;

#endif /* included_ip_ip6_error_h */
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip6_format.c: ip6 formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> + +/* Format an IP6 address. */ +u8 * format_ip6_address (u8 * s, va_list * args) +{ + ip6_address_t * a = va_arg (*args, ip6_address_t *); + u32 max_zero_run = 0, this_zero_run = 0; + int max_zero_run_index = -1, this_zero_run_index=0; + int in_zero_run = 0, i; + int last_double_colon = 0; + + /* Ugh, this is a pain. 
Scan forward looking for runs of 0's */ + for (i = 0; i < ARRAY_LEN (a->as_u16); i++) + { + if (a->as_u16[i] == 0) + { + if (in_zero_run) + this_zero_run++; + else + { + in_zero_run = 1; + this_zero_run =1; + this_zero_run_index = i; + } + } + else + { + if (in_zero_run) + { + /* offer to compress the biggest run of > 1 zero */ + if (this_zero_run > max_zero_run && this_zero_run > 1) + { + max_zero_run_index = this_zero_run_index; + max_zero_run = this_zero_run; + } + } + in_zero_run = 0; + this_zero_run = 0; + } + } + + if (in_zero_run) + { + if (this_zero_run > max_zero_run && this_zero_run > 1) + { + max_zero_run_index = this_zero_run_index; + max_zero_run = this_zero_run; + } + } + + for (i = 0; i < ARRAY_LEN (a->as_u16); i++) + { + if (i == max_zero_run_index) + { + s = format (s, "::"); + i += max_zero_run - 1; + last_double_colon = 1; + } + else + { + s = format (s, "%s%x", + (last_double_colon || i == 0) ? "" : ":", + clib_net_to_host_u16 (a->as_u16[i])); + last_double_colon = 0; + } + } + + return s; +} + +/* Format an IP6 route destination and length. */ +u8 * format_ip6_address_and_length (u8 * s, va_list * args) +{ + ip6_address_t * a = va_arg (*args, ip6_address_t *); + u8 l = va_arg (*args, u32); + return format (s, "%U/%d", format_ip6_address, a, l); +} + +/* Parse an IP6 address. 
*/ +uword unformat_ip6_address (unformat_input_t * input, va_list * args) +{ + ip6_address_t * result = va_arg (*args, ip6_address_t *); + u16 hex_quads[8]; + uword hex_quad, n_hex_quads, hex_digit, n_hex_digits; + uword c, n_colon, double_colon_index; + + n_hex_quads = hex_quad = n_hex_digits = n_colon = 0; + double_colon_index = ARRAY_LEN (hex_quads); + while ((c = unformat_get_input (input)) != UNFORMAT_END_OF_INPUT) + { + hex_digit = 16; + if (c >= '0' && c <= '9') + hex_digit = c - '0'; + else if (c >= 'a' && c <= 'f') + hex_digit = c + 10 - 'a'; + else if (c >= 'A' && c <= 'F') + hex_digit = c + 10 - 'A'; + else if (c == ':' && n_colon < 2) + n_colon++; + else + { + unformat_put_input (input); + break; + } + + /* Too many hex quads. */ + if (n_hex_quads >= ARRAY_LEN (hex_quads)) + return 0; + + if (hex_digit < 16) + { + hex_quad = (hex_quad << 4) | hex_digit; + + /* Hex quad must fit in 16 bits. */ + if (n_hex_digits >= 4) + return 0; + + n_colon = 0; + n_hex_digits++; + } + + /* Save position of :: */ + if (n_colon == 2) + { + /* More than one :: ? */ + if (double_colon_index < ARRAY_LEN (hex_quads)) + return 0; + double_colon_index = n_hex_quads; + } + + if (n_colon > 0 && n_hex_digits > 0) + { + hex_quads[n_hex_quads++] = hex_quad; + hex_quad = 0; + n_hex_digits = 0; + } + } + + if (n_hex_digits > 0) + hex_quads[n_hex_quads++] = hex_quad; + + { + word i; + + /* Expand :: to appropriate number of zero hex quads. */ + if (double_colon_index < ARRAY_LEN (hex_quads)) + { + word n_zero = ARRAY_LEN (hex_quads) - n_hex_quads; + + for (i = n_hex_quads - 1; i >= (signed) double_colon_index; i--) + hex_quads[n_zero + i] = hex_quads[i]; + + for (i = 0; i < n_zero; i++) + { + ASSERT ((double_colon_index + i) < ARRAY_LEN (hex_quads)); + hex_quads[double_colon_index + i] = 0; + } + + n_hex_quads = ARRAY_LEN (hex_quads); + } + + /* Too few hex quads given. 
*/ + if (n_hex_quads < ARRAY_LEN (hex_quads)) + return 0; + + for (i = 0; i < ARRAY_LEN (hex_quads); i++) + result->as_u16[i] = clib_host_to_net_u16 (hex_quads[i]); + + return 1; + } +} + +/* Format an IP6 header. */ +u8 * format_ip6_header (u8 * s, va_list * args) +{ + ip6_header_t * ip = va_arg (*args, ip6_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + u32 i, ip_version, traffic_class, flow_label; + uword indent; + + /* Nothing to do. */ + if (max_header_bytes < sizeof (ip[0])) + return format (s, "IP header truncated"); + + indent = format_get_indent (s); + indent += 2; + + s = format (s, "%U: %U -> %U", + format_ip_protocol, ip->protocol, + format_ip6_address, &ip->src_address, + format_ip6_address, &ip->dst_address); + + i = clib_net_to_host_u32 (ip->ip_version_traffic_class_and_flow_label); + ip_version = (i >> 28); + traffic_class = (i >> 20) & 0xff; + flow_label = i & pow2_mask (20); + + if (ip_version != 6) + s = format (s, "\n%Uversion %d", + format_white_space, indent, ip_version); + + s = format (s, "\n%Utos 0x%02x, flow label 0x%x, hop limit %d, payload length %d", + format_white_space, indent, + traffic_class, flow_label, ip->hop_limit, + clib_net_to_host_u16 (ip->payload_length)); + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && sizeof (ip[0]) < max_header_bytes) + { + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", + format_white_space, indent - 2, + pi->format_header, + /* next protocol header */ (void*) (ip + 1), + max_header_bytes - sizeof (ip[0])); + } + + return s; +} + +/* Parse an IP6 header. */ +uword unformat_ip6_header (unformat_input_t * input, va_list * args) +{ + u8 ** result = va_arg (*args, u8 **); + ip6_header_t * ip; + int old_length; + + /* Allocate space for IP header. 
*/ + { + void * p; + + old_length = vec_len (*result); + vec_add2 (*result, p, sizeof (ip[0])); + ip = p; + } + + memset (ip, 0, sizeof (ip[0])); + ip->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (6 << 28); + + if (! unformat (input, "%U: %U -> %U", + unformat_ip_protocol, &ip->protocol, + unformat_ip6_address, &ip->src_address, + unformat_ip6_address, &ip->dst_address)) + return 0; + + /* Parse options. */ + while (1) + { + int i; + + if (unformat (input, "tos %U", unformat_vlib_number, &i)) + ip->ip_version_traffic_class_and_flow_label |= clib_host_to_net_u32 ((i & 0xff) << 20); + + else if (unformat (input, "hop-limit %U", unformat_vlib_number, &i)) + ip->hop_limit = i; + + /* Can't parse input: try next protocol level. */ + else + break; + } + + /* Recurse into next protocol layer. */ + { + ip_main_t * im = &ip_main; + ip_protocol_info_t * pi = ip_get_protocol_info (im, ip->protocol); + + if (pi && pi->unformat_header) + { + if (! unformat_user (input, pi->unformat_header, result)) + return 0; + + /* Result may have moved. */ + ip = (void *) *result + old_length; + } + } + + ip->payload_length = clib_host_to_net_u16 (vec_len (*result) - (old_length + sizeof (ip[0]))); + + return 1; +} diff --git a/vnet/vnet/ip/ip6_forward.c b/vnet/vnet/ip/ip6_forward.c new file mode 100644 index 00000000000..f0065e969f8 --- /dev/null +++ b/vnet/vnet/ip/ip6_forward.c @@ -0,0 +1,2724 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip6_forward.c: IP v6 forwarding + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/vnet.h> +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> /* for ethernet_header_t */ +#include <vnet/srp/srp.h> /* for srp_hw_interface_class */ +#include <vppinfra/cache.h> + +#include <vppinfra/bihash_template.c> + +static void compute_prefix_lengths_in_search_order (ip6_main_t * im) +{ + int i; + vec_reset_length (im->prefix_lengths_in_search_order); + /* Note: bitmap reversed so this is in fact a longest prefix match */ + clib_bitmap_foreach (i, im->non_empty_dst_address_length_bitmap, + ({ + int dst_address_length = 128 - i; + vec_add1 (im->prefix_lengths_in_search_order, dst_address_length); + })); +} + +u32 +ip6_fib_lookup_with_table (ip6_main_t * im, u32 fib_index, ip6_address_t * dst) +{ + ip_lookup_main_t * lm = &im->lookup_main; + int i, len; + int rv; + BVT(clib_bihash_kv) kv, value; + + len = vec_len (im->prefix_lengths_in_search_order); + + for (i = 0; i < len; i++) + { + int dst_address_length = im->prefix_lengths_in_search_order[i]; + ip6_address_t * mask = &im->fib_masks[dst_address_length]; + + ASSERT(dst_address_length >= 0 && dst_address_length <= 128); + + kv.key[0] = dst->as_u64[0] & mask->as_u64[0]; + kv.key[1] = dst->as_u64[1] & mask->as_u64[1]; + kv.key[2] = ((u64)((fib_index))<<32) | dst_address_length; + + rv = BV(clib_bihash_search_inline_2)(&im->ip6_lookup_table, &kv, &value); + if (rv == 0) + return value.value; + } + + return lm->miss_adj_index; +} + +u32 ip6_fib_lookup (ip6_main_t * im, u32 sw_if_index, ip6_address_t * dst) +{ + u32 fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index); + return ip6_fib_lookup_with_table (im, fib_index, dst); +} + +void +vnet_ip6_fib_init (ip6_main_t * im, u32 fib_index) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip6_add_del_route_args_t a; + ip_adjacency_t * adj; + + memset(&a, 0x0, sizeof(ip6_add_del_route_args_t)); + + a.table_index_or_table_id = fib_index; + a.flags = (IP6_ROUTE_FLAG_ADD + | IP6_ROUTE_FLAG_FIB_INDEX + | 
IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY + | IP6_ROUTE_FLAG_NO_REDISTRIBUTE); + + /* Add ff02::1:ff00:0/104 via local route for all tables. + This is required for neighbor discovery to work. */ + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &a.adj_index); + adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL; + adj->if_address_index = ~0; + adj->rewrite_header.data_bytes = 0; + + ip6_set_solicited_node_multicast_address (&a.dst_address, 0); + + a.dst_address_length = 104; + ip6_add_del_route (im, &a); + + /* Add all-routers multicast address via local route for all tables */ + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &a.adj_index); + adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL; + adj->if_address_index = ~0; + adj->rewrite_header.data_bytes = 0; + + ip6_set_reserved_multicast_address (&a.dst_address, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_all_routers); + + a.dst_address_length = 128; + ip6_add_del_route (im, &a); + + /* Add all-nodes multicast address via local route for all tables */ + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &a.adj_index); + adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL; + adj->if_address_index = ~0; + adj->rewrite_header.data_bytes = 0; + + ip6_set_reserved_multicast_address (&a.dst_address, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_all_hosts); + + a.dst_address_length = 128; + ip6_add_del_route (im, &a); + + /* Add all-mldv2 multicast address via local route for all tables */ + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &a.adj_index); + adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL; + adj->if_address_index = ~0; + adj->rewrite_header.data_bytes = 0; + + ip6_set_reserved_multicast_address (&a.dst_address, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_mldv2_routers); + + a.dst_address_length = 128; + ip6_add_del_route (im, &a); +} + +static ip6_fib_t * +create_fib_with_table_id (ip6_main_t * im, u32 table_id) 
+{ + ip6_fib_t * fib; + hash_set (im->fib_index_by_table_id, table_id, vec_len (im->fibs)); + vec_add2 (im->fibs, fib, 1); + fib->table_id = table_id; + fib->index = fib - im->fibs; + fib->flow_hash_config = IP_FLOW_HASH_DEFAULT; + vnet_ip6_fib_init (im, fib->index); + return fib; +} + +ip6_fib_t * +find_ip6_fib_by_table_index_or_id (ip6_main_t * im, u32 table_index_or_id, u32 flags) +{ + uword * p, fib_index; + + fib_index = table_index_or_id; + if (! (flags & IP6_ROUTE_FLAG_FIB_INDEX)) + { + p = hash_get (im->fib_index_by_table_id, table_index_or_id); + if (! p) + return create_fib_with_table_id (im, table_index_or_id); + fib_index = p[0]; + } + return vec_elt_at_index (im->fibs, fib_index); +} + +void ip6_add_del_route (ip6_main_t * im, ip6_add_del_route_args_t * a) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip6_fib_t * fib; + ip6_address_t dst_address; + u32 dst_address_length, adj_index; + uword is_del; + u32 old_adj_index = ~0; + BVT(clib_bihash_kv) kv, value; + + vlib_smp_unsafe_warning(); + + is_del = (a->flags & IP6_ROUTE_FLAG_DEL) != 0; + + /* Either create new adjacency or use given one depending on arguments. 
*/ + if (a->n_add_adj > 0) + { + ip_add_adjacency (lm, a->add_adj, a->n_add_adj, &adj_index); + ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0); + } + else + adj_index = a->adj_index; + + dst_address = a->dst_address; + dst_address_length = a->dst_address_length; + fib = find_ip6_fib_by_table_index_or_id (im, a->table_index_or_table_id, + a->flags); + + ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks)); + ip6_address_mask (&dst_address, &im->fib_masks[dst_address_length]); + + /* refcount accounting */ + if (is_del) + { + ASSERT (im->dst_address_length_refcounts[dst_address_length] > 0); + if (--im->dst_address_length_refcounts[dst_address_length] == 0) + { + im->non_empty_dst_address_length_bitmap = + clib_bitmap_set (im->non_empty_dst_address_length_bitmap, + 128 - dst_address_length, 0); + compute_prefix_lengths_in_search_order (im); + } + } + else + { + im->dst_address_length_refcounts[dst_address_length]++; + + im->non_empty_dst_address_length_bitmap = + clib_bitmap_set (im->non_empty_dst_address_length_bitmap, + 128 - dst_address_length, 1); + compute_prefix_lengths_in_search_order (im); + } + + kv.key[0] = dst_address.as_u64[0]; + kv.key[1] = dst_address.as_u64[1]; + kv.key[2] = ((u64)((fib - im->fibs))<<32) | dst_address_length; + + if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) == 0) + old_adj_index = value.value; + + if (is_del) + BV(clib_bihash_add_del) (&im->ip6_lookup_table, &kv, 0 /* is_add */); + else + { + /* Make sure adj index is valid. */ + if (CLIB_DEBUG > 0) + (void) ip_get_adjacency (lm, adj_index); + + kv.value = adj_index; + + BV(clib_bihash_add_del) (&im->ip6_lookup_table, &kv, 1 /* is_add */); + } + + /* Delete old adjacency index if present and changed. */ + { + if (! 
(a->flags & IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY) + && old_adj_index != ~0 + && old_adj_index != adj_index) + ip_del_adjacency (lm, old_adj_index); + } +} + +void +ip6_add_del_route_next_hop (ip6_main_t * im, + u32 flags, + ip6_address_t * dst_address, + u32 dst_address_length, + ip6_address_t * next_hop, + u32 next_hop_sw_if_index, + u32 next_hop_weight, u32 adj_index, + u32 explicit_fib_index) +{ + vnet_main_t * vnm = vnet_get_main(); + ip_lookup_main_t * lm = &im->lookup_main; + u32 fib_index; + ip6_fib_t * fib; + ip6_address_t masked_dst_address; + u32 old_mp_adj_index, new_mp_adj_index; + u32 dst_adj_index, nh_adj_index; + int rv; + ip_adjacency_t * dst_adj; + ip_multipath_adjacency_t * old_mp, * new_mp; + int is_del = (flags & IP6_ROUTE_FLAG_DEL) != 0; + int is_interface_next_hop; + clib_error_t * error = 0; + uword * nh_result; + BVT(clib_bihash_kv) kv, value; + + vlib_smp_unsafe_warning(); + + if (explicit_fib_index == (u32)~0) + fib_index = vec_elt (im->fib_index_by_sw_if_index, next_hop_sw_if_index); + else + fib_index = explicit_fib_index; + + fib = vec_elt_at_index (im->fibs, fib_index); + + /* Lookup next hop to be added or deleted. 
*/ + is_interface_next_hop = ip6_address_is_zero (next_hop); + if (adj_index == (u32)~0) + { + if (is_interface_next_hop) + { + nh_result = hash_get (im->interface_route_adj_index_by_sw_if_index, + next_hop_sw_if_index); + if (nh_result) + nh_adj_index = *nh_result; + else + { + ip_adjacency_t * adj; + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &nh_adj_index); + ip6_adjacency_set_interface_route (vnm, adj, + next_hop_sw_if_index, ~0); + ip_call_add_del_adjacency_callbacks + (lm, next_hop_sw_if_index, /* is_del */ 0); + hash_set (im->interface_route_adj_index_by_sw_if_index, + next_hop_sw_if_index, nh_adj_index); + } + } + else + { + /* Look for the interface /128 route */ + kv.key[0] = next_hop->as_u64[0]; + kv.key[1] = next_hop->as_u64[1]; + kv.key[2] = ((u64)((fib - im->fibs))<<32) | 128; + + if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) < 0) + { + vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION; + error = clib_error_return (0, "next-hop %U/128 not in FIB", + format_ip6_address, next_hop); + goto done; + } + + nh_adj_index = value.value; + } + } + else + { + /* Look for the interface /128 route */ + kv.key[0] = next_hop->as_u64[0]; + kv.key[1] = next_hop->as_u64[1]; + kv.key[2] = ((u64)((fib - im->fibs))<<32) | 128; + + if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) < 0) + { + vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION; + error = clib_error_return (0, "next-hop %U/128 not in FIB", + format_ip6_address, next_hop); + goto done; + } + + nh_adj_index = value.value; + } + + ASSERT (dst_address_length < ARRAY_LEN (im->fib_masks)); + masked_dst_address = dst_address[0]; + ip6_address_mask (&masked_dst_address, &im->fib_masks[dst_address_length]); + + kv.key[0] = masked_dst_address.as_u64[0]; + kv.key[1] = masked_dst_address.as_u64[1]; + kv.key[2] = ((u64)((fib - im->fibs))<<32) | dst_address_length; + + rv = BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value); + + if (rv == 0) + { + 
dst_adj_index = value.value; + dst_adj = ip_get_adjacency (lm, dst_adj_index); + } + else + { + /* For deletes destination must be known. */ + if (is_del) + { + vnm->api_errno = VNET_API_ERROR_UNKNOWN_DESTINATION; + error = clib_error_return (0, "unknown destination %U/%d", + format_ip6_address, dst_address, + dst_address_length); + goto done; + } + + dst_adj_index = ~0; + dst_adj = 0; + } + + /* Ignore adds of X/128 with next hop of X. */ + if (! is_del + && dst_address_length == 128 + && ip6_address_is_equal (dst_address, next_hop)) + { + vnm->api_errno = VNET_API_ERROR_PREFIX_MATCHES_NEXT_HOP; + error = clib_error_return (0, "prefix matches next hop %U/%d", + format_ip6_address, dst_address, + dst_address_length); + goto done; + } + + old_mp_adj_index = dst_adj ? dst_adj->heap_handle : ~0; + + if (! ip_multipath_adjacency_add_del_next_hop + (lm, is_del, + dst_adj ? dst_adj->heap_handle : ~0, + nh_adj_index, + next_hop_weight, + &new_mp_adj_index)) + { + vnm->api_errno = VNET_API_ERROR_NEXT_HOP_NOT_FOUND_MP; + error = clib_error_return + (0, "requested deleting next-hop %U not found in multi-path", + format_ip6_address, next_hop); + goto done; + } + + old_mp = new_mp = 0; + if (old_mp_adj_index != ~0) + old_mp = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index); + if (new_mp_adj_index != ~0) + new_mp = vec_elt_at_index (lm->multipath_adjacencies, new_mp_adj_index); + + if (old_mp != new_mp) + { + ip6_add_del_route_args_t a; + a.table_index_or_table_id = fib_index; + a.flags = ((is_del ? IP6_ROUTE_FLAG_DEL : IP6_ROUTE_FLAG_ADD) + | IP6_ROUTE_FLAG_FIB_INDEX + | IP6_ROUTE_FLAG_KEEP_OLD_ADJACENCY + | (flags & IP6_ROUTE_FLAG_NO_REDISTRIBUTE)); + a.dst_address = dst_address[0]; + a.dst_address_length = dst_address_length; + a.adj_index = new_mp ? 
new_mp->adj_index : dst_adj_index; + a.add_adj = 0; + a.n_add_adj = 0; + + ip6_add_del_route (im, &a); + } + + done: + if (error) + clib_error_report (error); +} + +u32 +ip6_get_route (ip6_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip6_address_t * address, + u32 address_length) +{ + ip6_fib_t * fib = find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags); + ip6_address_t masked_address; + BVT(clib_bihash_kv) kv, value; + + ASSERT (address_length < ARRAY_LEN (im->fib_masks)); + memcpy (&masked_address, address, sizeof (masked_address)); + ip6_address_mask (&masked_address, &im->fib_masks[address_length]); + + kv.key[0] = masked_address.as_u64[0]; + kv.key[1] = masked_address.as_u64[1]; + kv.key[2] = ((u64)((fib - im->fibs))<<32) | address_length; + + if (BV(clib_bihash_search)(&im->ip6_lookup_table, &kv, &value) == 0) + return (value.value); + return 0; +} + +void +ip6_foreach_matching_route (ip6_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip6_address_t * dst_address, + u32 address_length, + ip6_address_t ** results, + u8 ** result_lengths) +{ + ip6_fib_t * fib = + find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags); + BVT(clib_bihash) * h = &im->ip6_lookup_table; + BVT(clib_bihash_value) * v; + clib_bihash_bucket_t * b; + int i, j, k; + + if (*results) + _vec_len (*results) = 0; + if (*result_lengths) + _vec_len (*result_lengths) = 0; + + /* Walk the table looking for routes which match the supplied address */ + for (i = 0; i < h->nbuckets; i++) + { + b = &h->buckets [i]; + if (b->offset == 0) + continue; + + v = BV(clib_bihash_get_value) (h, b->offset); + for (j = 0; j < (1<<b->log2_pages); j++) + { + for (k = 0; k < BIHASH_KVP_PER_PAGE; k++) + { + if (BV(clib_bihash_is_free)(&v->kvp[k])) + continue; + + if ((v->kvp[k].key[2] + == (((u64)((fib - im->fibs))<<32) | address_length)) + && ip6_destination_matches_route + (im, dst_address, (ip6_address_t *) &v->kvp[k], + address_length)) + { + 
ip6_address_t * a; + + a = (ip6_address_t *)(&v->kvp[k]); + + vec_add1 (*results, a[0]); + vec_add1 (*result_lengths, address_length); + } + } + v++; + } + } +} + +void ip6_maybe_remap_adjacencies (ip6_main_t * im, + u32 table_index_or_table_id, + u32 flags) +{ +#if SOONE + ip6_fib_t * fib + = find_ip6_fib_by_table_index_or_id (im, table_index_or_table_id, flags); +#endif + ip_lookup_main_t * lm = &im->lookup_main; + + if (lm->n_adjacency_remaps == 0) + return; + + clib_warning ("unimplemented, please report to vpp-dev@cisco.com"); + + /* All remaps have been performed. */ + lm->n_adjacency_remaps = 0; +} + +void ip6_delete_matching_routes (ip6_main_t * im, + u32 table_index_or_table_id, + u32 flags, + ip6_address_t * address, + u32 address_length) +{ + /* $$$$ static may be OK - this should happen only on thread 0 */ + static ip6_address_t * matching_addresses; + static u8 * matching_address_lengths; + u32 l, i; + ip6_add_del_route_args_t a; + + vlib_smp_unsafe_warning(); + + a.flags = IP6_ROUTE_FLAG_DEL | IP6_ROUTE_FLAG_NO_REDISTRIBUTE | flags; + a.table_index_or_table_id = table_index_or_table_id; + a.adj_index = ~0; + a.add_adj = 0; + a.n_add_adj = 0; + + for (l = address_length + 1; l <= 128; l++) + { + ip6_foreach_matching_route (im, table_index_or_table_id, flags, + address, + l, + &matching_addresses, + &matching_address_lengths); + for (i = 0; i < vec_len (matching_addresses); i++) + { + a.dst_address = matching_addresses[i]; + a.dst_address_length = matching_address_lengths[i]; + ip6_add_del_route (im, &a); + } + } + + ip6_maybe_remap_adjacencies (im, table_index_or_table_id, flags); +} + +static uword +ip6_lookup (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + vlib_combined_counter_main_t * cm = &im->lookup_main.adjacency_counters; + u32 n_left_from, n_left_to_next, * from, * to_next; + ip_lookup_next_t next; + u32 cpu_index = os_get_cpu_number(); + + 
from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t * p0, * p1; + u32 pi0, pi1, adj_index0, adj_index1, wrong_next; + ip_lookup_next_t next0, next1; + ip6_header_t * ip0, * ip1; + ip_adjacency_t * adj0, * adj1; + u32 fib_index0, fib_index1; + u32 flow_hash_config0, flow_hash_config1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD); + CLIB_PREFETCH (p3->data, sizeof (ip0[0]), LOAD); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]); + fib_index1 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p1)->sw_if_index[VLIB_RX]); + + fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ? + fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX]; + fib_index1 = (vnet_buffer(p1)->sw_if_index[VLIB_TX] == (u32)~0) ? 
+ fib_index1 : vnet_buffer(p1)->sw_if_index[VLIB_TX]; + + adj_index0 = ip6_fib_lookup_with_table (im, fib_index0, + &ip0->dst_address); + adj_index1 = ip6_fib_lookup_with_table (im, fib_index1, + &ip1->dst_address); + + adj0 = ip_get_adjacency (lm, adj_index0); + adj1 = ip_get_adjacency (lm, adj_index1); + + if (PREDICT_FALSE (adj0->explicit_fib_index != ~0)) + { + adj_index0 = ip6_fib_lookup_with_table + (im, adj0->explicit_fib_index, &ip0->dst_address); + adj0 = ip_get_adjacency (lm, adj_index0); + } + if (PREDICT_FALSE (adj1->explicit_fib_index != ~0)) + { + adj_index1 = ip6_fib_lookup_with_table + (im, adj1->explicit_fib_index, &ip1->dst_address); + adj1 = ip_get_adjacency (lm, adj_index1); + } + + next0 = adj0->lookup_next_index; + next1 = adj1->lookup_next_index; + + /* Process hop-by-hop options if present */ + next0 = (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) ? + IP_LOOKUP_NEXT_HOP_BY_HOP : next0; + next1 = (ip1->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) ? + IP_LOOKUP_NEXT_HOP_BY_HOP : next1; + + vnet_buffer (p0)->ip.flow_hash = + vnet_buffer(p1)->ip.flow_hash = 0; + + if (PREDICT_FALSE(adj0->n_adj > 1)) + { + flow_hash_config0 = + vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config; + vnet_buffer (p0)->ip.flow_hash = + ip6_compute_flow_hash (ip0, flow_hash_config0); + } + + if (PREDICT_FALSE(adj1->n_adj > 1)) + { + flow_hash_config1 = + vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config; + + vnet_buffer (p1)->ip.flow_hash = + ip6_compute_flow_hash (ip1, flow_hash_config1); + } + + ASSERT (adj0->n_adj > 0); + ASSERT (adj1->n_adj > 0); + ASSERT (is_pow2 (adj0->n_adj)); + ASSERT (is_pow2 (adj1->n_adj)); + adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1)); + adj_index1 += (vnet_buffer (p1)->ip.flow_hash & (adj1->n_adj - 1)); + + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0; + vnet_buffer (p1)->ip.adj_index[VLIB_TX] = adj_index1; + + vlib_increment_combined_counter + (cm, cpu_index, adj_index0, 1, + 
vlib_buffer_length_in_chain (vm, p0)); + vlib_increment_combined_counter + (cm, cpu_index, adj_index1, 1, + vlib_buffer_length_in_chain (vm, p1)); + + from += 2; + to_next += 2; + n_left_to_next -= 2; + n_left_from -= 2; + + wrong_next = (next0 != next) + 2*(next1 != next); + if (PREDICT_FALSE (wrong_next != 0)) + { + switch (wrong_next) + { + case 1: + /* A B A */ + to_next[-2] = pi1; + to_next -= 1; + n_left_to_next += 1; + vlib_set_next_frame_buffer (vm, node, next0, pi0); + break; + + case 2: + /* A A B */ + to_next -= 1; + n_left_to_next += 1; + vlib_set_next_frame_buffer (vm, node, next1, pi1); + break; + + case 3: + /* A B C */ + to_next -= 2; + n_left_to_next += 2; + vlib_set_next_frame_buffer (vm, node, next0, pi0); + vlib_set_next_frame_buffer (vm, node, next1, pi1); + if (next0 == next1) + { + /* A B B */ + vlib_put_next_frame (vm, node, next, n_left_to_next); + next = next1; + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + } + } + } + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + u32 pi0, adj_index0; + ip_lookup_next_t next0; + ip_adjacency_t * adj0; + u32 fib_index0, flow_hash_config0; + + pi0 = from[0]; + to_next[0] = pi0; + + p0 = vlib_get_buffer (vm, pi0); + + ip0 = vlib_buffer_get_current (p0); + + fib_index0 = vec_elt (im->fib_index_by_sw_if_index, vnet_buffer (p0)->sw_if_index[VLIB_RX]); + fib_index0 = (vnet_buffer(p0)->sw_if_index[VLIB_TX] == (u32)~0) ? 
+ fib_index0 : vnet_buffer(p0)->sw_if_index[VLIB_TX]; + + flow_hash_config0 = + vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config; + + adj_index0 = ip6_fib_lookup_with_table (im, fib_index0, + &ip0->dst_address); + + adj0 = ip_get_adjacency (lm, adj_index0); + + if (PREDICT_FALSE (adj0->explicit_fib_index != ~0)) + { + adj_index0 = ip6_fib_lookup_with_table + (im, adj0->explicit_fib_index, &ip0->dst_address); + adj0 = ip_get_adjacency (lm, adj_index0); + } + + next0 = adj0->lookup_next_index; + next0 = (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) ? + IP_LOOKUP_NEXT_HOP_BY_HOP : next0; + + vnet_buffer (p0)->ip.flow_hash = 0; + + if (PREDICT_FALSE(adj0->n_adj > 1)) + { + flow_hash_config0 = + vec_elt_at_index (im->fibs,fib_index0)->flow_hash_config; + vnet_buffer (p0)->ip.flow_hash = + ip6_compute_flow_hash (ip0, flow_hash_config0); + } + + ASSERT (adj0->n_adj > 0); + ASSERT (is_pow2 (adj0->n_adj)); + adj_index0 += (vnet_buffer (p0)->ip.flow_hash & (adj0->n_adj - 1)); + + vnet_buffer (p0)->ip.adj_index[VLIB_TX] = adj_index0; + + vlib_increment_combined_counter + (cm, cpu_index, adj_index0, 1, + vlib_buffer_length_in_chain (vm, p0)); + + from += 1; + to_next += 1; + n_left_to_next -= 1; + n_left_from -= 1; + + if (PREDICT_FALSE (next0 != next)) + { + n_left_to_next += 1; + vlib_put_next_frame (vm, node, next, n_left_to_next); + next = next0; + vlib_get_next_frame (vm, node, next, + to_next, n_left_to_next); + to_next[0] = pi0; + to_next += 1; + n_left_to_next -= 1; + } + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + return frame->n_vectors; +} + +void ip6_adjacency_set_interface_route (vnet_main_t * vnm, + ip_adjacency_t * adj, + u32 sw_if_index, + u32 if_address_index) +{ + vnet_hw_interface_t * hw = vnet_get_sup_hw_interface (vnm, sw_if_index); + ip_lookup_next_t n; + u32 node_index; + + if (hw->hw_class_index == ethernet_hw_interface_class.index + || hw->hw_class_index == srp_hw_interface_class.index) + { + n = 
IP_LOOKUP_NEXT_ARP; + node_index = ip6_discover_neighbor_node.index; + adj->if_address_index = if_address_index; + } + else + { + n = IP_LOOKUP_NEXT_REWRITE; + node_index = ip6_rewrite_node.index; + } + + adj->lookup_next_index = n; + adj->explicit_fib_index = ~0; + + vnet_rewrite_for_sw_interface + (vnm, + VNET_L3_PACKET_TYPE_IP6, + sw_if_index, + node_index, + VNET_REWRITE_FOR_SW_INTERFACE_ADDRESS_BROADCAST, + &adj->rewrite_header, + sizeof (adj->rewrite_data)); +} + +static void +ip6_add_interface_routes (vnet_main_t * vnm, u32 sw_if_index, + ip6_main_t * im, u32 fib_index, + ip_interface_address_t * a) +{ + ip_lookup_main_t * lm = &im->lookup_main; + ip_adjacency_t * adj; + ip6_address_t * address = ip_interface_address_get_address (lm, a); + ip6_add_del_route_args_t x; + vnet_hw_interface_t * hw_if = vnet_get_sup_hw_interface (vnm, sw_if_index); + u32 classify_table_index; + + /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). */ + x.table_index_or_table_id = fib_index; + x.flags = (IP6_ROUTE_FLAG_ADD + | IP6_ROUTE_FLAG_FIB_INDEX + | IP6_ROUTE_FLAG_NO_REDISTRIBUTE); + x.dst_address = address[0]; + x.dst_address_length = a->address_length; + x.n_add_adj = 0; + x.add_adj = 0; + + a->neighbor_probe_adj_index = ~0; + if (a->address_length < 128) + { + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &x.adj_index); + ip6_adjacency_set_interface_route (vnm, adj, sw_if_index, a - lm->if_address_pool); + ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0); + ip6_add_del_route (im, &x); + a->neighbor_probe_adj_index = x.adj_index; + } + + /* Add e.g. ::1/128 as local to this host. 
*/ + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &x.adj_index); + + classify_table_index = ~0; + if (sw_if_index < vec_len (lm->classify_table_index_by_sw_if_index)) + classify_table_index = lm->classify_table_index_by_sw_if_index [sw_if_index]; + if (classify_table_index != (u32) ~0) + { + adj->lookup_next_index = IP_LOOKUP_NEXT_CLASSIFY; + adj->classify_table_index = classify_table_index; + } + else + adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL; + + adj->if_address_index = a - lm->if_address_pool; + adj->rewrite_header.sw_if_index = sw_if_index; + adj->rewrite_header.max_l3_packet_bytes = hw_if->max_l3_packet_bytes[VLIB_RX]; + adj->rewrite_header.data_bytes = 0; + ip_call_add_del_adjacency_callbacks (lm, x.adj_index, /* is_del */ 0); + x.dst_address_length = 128; + ip6_add_del_route (im, &x); +} + +static void +ip6_del_interface_routes (ip6_main_t * im, u32 fib_index, + ip6_address_t * address, u32 address_length) +{ + ip6_add_del_route_args_t x; + + /* Add e.g. 1.0.0.0/8 as interface route (arp for Ethernet). 
*/ + x.table_index_or_table_id = fib_index; + x.flags = (IP6_ROUTE_FLAG_DEL + | IP6_ROUTE_FLAG_FIB_INDEX + | IP6_ROUTE_FLAG_NO_REDISTRIBUTE); + x.dst_address = address[0]; + x.dst_address_length = address_length; + x.adj_index = ~0; + x.n_add_adj = 0; + x.add_adj = 0; + + if (address_length < 128) + { + /* Don't wipe out fe80::0/64 */ + if (address_length != 64 || + address[0].as_u64[0] != clib_net_to_host_u64(0xfe80000000000000ULL)) + ip6_add_del_route (im, &x); + } + + x.dst_address_length = 128; + ip6_add_del_route (im, &x); + + ip6_delete_matching_routes (im, + fib_index, + IP6_ROUTE_FLAG_FIB_INDEX, + address, + address_length); +} + +typedef struct { + u32 sw_if_index; + ip6_address_t address; + u32 length; +} ip6_interface_address_t; + +static clib_error_t * +ip6_add_del_interface_address_internal (vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t * new_address, + u32 new_length, + u32 redistribute, + u32 insert_routes, + u32 is_del); + +static clib_error_t * +ip6_add_del_interface_address_internal (vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t * address, + u32 address_length, + u32 redistribute, + u32 insert_routes, + u32 is_del) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + clib_error_t * error; + u32 if_address_index; + ip6_address_fib_t ip6_af, * addr_fib = 0; + + vec_validate (im->fib_index_by_sw_if_index, sw_if_index); + ip6_addr_fib_init (&ip6_af, address, + vec_elt (im->fib_index_by_sw_if_index, sw_if_index)); + vec_add1 (addr_fib, ip6_af); + + { + uword elts_before = pool_elts (lm->if_address_pool); + + error = ip_interface_address_add_del + (lm, + sw_if_index, + addr_fib, + address_length, + is_del, + &if_address_index); + if (error) + goto done; + + /* Pool did not grow: add duplicate address. 
*/ + if (elts_before == pool_elts (lm->if_address_pool)) + goto done; + } + + if (vnet_sw_interface_is_admin_up (vnm, sw_if_index) && insert_routes) + { + if (is_del) + ip6_del_interface_routes (im, ip6_af.fib_index, address, + address_length); + + else + ip6_add_interface_routes (vnm, sw_if_index, + im, ip6_af.fib_index, + pool_elt_at_index (lm->if_address_pool, if_address_index)); + } + + { + ip6_add_del_interface_address_callback_t * cb; + vec_foreach (cb, im->add_del_interface_address_callbacks) + cb->function (im, cb->function_opaque, sw_if_index, + address, address_length, + if_address_index, + is_del); + } + + done: + vec_free (addr_fib); + return error; +} + +clib_error_t * +ip6_add_del_interface_address (vlib_main_t * vm, u32 sw_if_index, + ip6_address_t * address, u32 address_length, + u32 is_del) +{ + return ip6_add_del_interface_address_internal + (vm, sw_if_index, address, address_length, + /* redistribute */ 1, + /* insert_routes */ 1, + is_del); +} + +clib_error_t * +ip6_sw_interface_admin_up_down (vnet_main_t * vnm, + u32 sw_if_index, + u32 flags) +{ + ip6_main_t * im = &ip6_main; + ip_interface_address_t * ia; + ip6_address_t * a; + u32 is_admin_up, fib_index; + + /* Fill in lookup tables with default table (0). 
*/ + vec_validate (im->fib_index_by_sw_if_index, sw_if_index); + + vec_validate_init_empty (im->lookup_main.if_address_pool_index_by_sw_if_index, sw_if_index, ~0); + + is_admin_up = (flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP) != 0; + + fib_index = vec_elt (im->fib_index_by_sw_if_index, sw_if_index); + + foreach_ip_interface_address (&im->lookup_main, ia, sw_if_index, + 0 /* honor unnumbered */, + ({ + a = ip_interface_address_get_address (&im->lookup_main, ia); + if (is_admin_up) + ip6_add_interface_routes (vnm, sw_if_index, + im, fib_index, + ia); + else + ip6_del_interface_routes (im, fib_index, + a, ia->address_length); + })); + + return 0; +} + +VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip6_sw_interface_admin_up_down); + +clib_error_t * +ip6_sw_interface_add_del (vnet_main_t * vnm, + u32 sw_if_index, + u32 is_add) +{ + vlib_main_t * vm = vnm->vlib_main; + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + u32 ci, cast; + + for (cast = 0; cast < VNET_N_CAST; cast++) + { + ip_config_main_t * cm = &lm->rx_config_mains[cast]; + vnet_config_main_t * vcm = &cm->config_main; + + /* FIXME multicast. */ + if (! 
vcm->node_index_by_feature_index) + { + char * start_nodes[] = { "ip6-input", }; + char * feature_nodes[] = { + [IP6_RX_FEATURE_CHECK_ACCESS] = "ip6-inacl", + [IP6_RX_FEATURE_IPSEC] = "ipsec-input-ip6", + [IP6_RX_FEATURE_L2TPV3] = "l2tp-decap", + [IP6_RX_FEATURE_VPATH] = "vpath-input-ip6", + [IP6_RX_FEATURE_LOOKUP] = "ip6-lookup", + }; + vnet_config_init (vm, vcm, + start_nodes, ARRAY_LEN (start_nodes), + feature_nodes, ARRAY_LEN (feature_nodes)); + } + + vec_validate_init_empty (cm->config_index_by_sw_if_index, sw_if_index, ~0); + ci = cm->config_index_by_sw_if_index[sw_if_index]; + + if (is_add) + ci = vnet_config_add_feature (vm, vcm, + ci, + IP6_RX_FEATURE_LOOKUP, + /* config data */ 0, + /* # bytes of config data */ 0); + else + ci = vnet_config_del_feature (vm, vcm, + ci, + IP6_RX_FEATURE_LOOKUP, + /* config data */ 0, + /* # bytes of config data */ 0); + + cm->config_index_by_sw_if_index[sw_if_index] = ci; + } + return /* no error */ 0; +} + +VNET_SW_INTERFACE_ADD_DEL_FUNCTION (ip6_sw_interface_add_del); + +VLIB_REGISTER_NODE (ip6_lookup_node) = { + .function = ip6_lookup, + .name = "ip6-lookup", + .vector_size = sizeof (u32), + + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = "ip6-miss", + [IP_LOOKUP_NEXT_DROP] = "ip6-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip6-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip6-local", + [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor", + [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite", + [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify", + [IP_LOOKUP_NEXT_MAP] = "ip6-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd", + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop", + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop", + }, +}; + +typedef struct { + /* Adjacency taken. */ + u32 adj_index; + u32 flow_hash; + + /* Packet data, possibly *after* rewrite. 
*/ + u8 packet_data[64 - 1*sizeof(u32)]; +} ip6_forward_next_trace_t; + +static u8 * format_ip6_forward_next_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip6_forward_next_trace_t * t = va_arg (*args, ip6_forward_next_trace_t *); + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im = &ip6_main; + ip_adjacency_t * adj; + uword indent = format_get_indent (s); + + adj = ip_get_adjacency (&im->lookup_main, t->adj_index); + s = format (s, "adjacency: %U flow hash: 0x%08x", + format_ip_adjacency, + vnm, &im->lookup_main, t->adj_index, t->flow_hash); + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_REWRITE: + s = format (s, "\n%U%U", + format_white_space, indent, + format_ip_adjacency_packet_data, + vnm, &im->lookup_main, t->adj_index, + t->packet_data, sizeof (t->packet_data)); + break; + + default: + break; + } + + return s; +} + +/* Common trace function for all ip6-forward next nodes. */ +void +ip6_forward_next_trace (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + vlib_rx_or_tx_t which_adj_index) +{ + u32 * from, n_left; + + n_left = frame->n_vectors; + from = vlib_frame_vector_args (frame); + + while (n_left >= 4) + { + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + ip6_forward_next_trace_t * t0, * t1; + + /* Prefetch next iteration. 
*/ + vlib_prefetch_buffer_with_index (vm, from[2], LOAD); + vlib_prefetch_buffer_with_index (vm, from[3], LOAD); + + bi0 = from[0]; + bi1 = from[1]; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index]; + t0->flow_hash = vnet_buffer (b0)->ip.flow_hash; + memcpy (t0->packet_data, + vlib_buffer_get_current (b0), + sizeof (t0->packet_data)); + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + t1 = vlib_add_trace (vm, node, b1, sizeof (t1[0])); + t1->adj_index = vnet_buffer (b1)->ip.adj_index[which_adj_index]; + t1->flow_hash = vnet_buffer (b1)->ip.flow_hash; + memcpy (t1->packet_data, + vlib_buffer_get_current (b1), + sizeof (t1->packet_data)); + } + from += 2; + n_left -= 2; + } + + while (n_left >= 1) + { + u32 bi0; + vlib_buffer_t * b0; + ip6_forward_next_trace_t * t0; + + bi0 = from[0]; + + b0 = vlib_get_buffer (vm, bi0); + + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + t0 = vlib_add_trace (vm, node, b0, sizeof (t0[0])); + t0->adj_index = vnet_buffer (b0)->ip.adj_index[which_adj_index]; + t0->flow_hash = vnet_buffer (b0)->ip.flow_hash; + memcpy (t0->packet_data, + vlib_buffer_get_current (b0), + sizeof (t0->packet_data)); + } + from += 1; + n_left -= 1; + } +} + +static uword +ip6_drop_or_punt (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + ip6_error_t error_code) +{ + u32 * buffers = vlib_frame_vector_args (frame); + uword n_packets = frame->n_vectors; + + vlib_error_drop_buffers (vm, node, + buffers, + /* stride */ 1, + n_packets, + /* next */ 0, + ip6_input_node.index, + error_code); + + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip6_forward_next_trace (vm, node, frame, VLIB_TX); + + return n_packets; +} + +static uword +ip6_drop (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip6_drop_or_punt (vm, node, frame, 
IP6_ERROR_ADJACENCY_DROP); } + +static uword +ip6_punt (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip6_drop_or_punt (vm, node, frame, IP6_ERROR_ADJACENCY_PUNT); } + +static uword +ip6_miss (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip6_drop_or_punt (vm, node, frame, IP6_ERROR_DST_LOOKUP_MISS); } + +VLIB_REGISTER_NODE (ip6_drop_node,static) = { + .function = ip6_drop, + .name = "ip6-drop", + .vector_size = sizeof (u32), + + .format_trace = format_ip6_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +VLIB_REGISTER_NODE (ip6_punt_node,static) = { + .function = ip6_punt, + .name = "ip6-punt", + .vector_size = sizeof (u32), + + .format_trace = format_ip6_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-punt", + }, +}; + +VLIB_REGISTER_NODE (ip6_miss_node,static) = { + .function = ip6_miss, + .name = "ip6-miss", + .vector_size = sizeof (u32), + + .format_trace = format_ip6_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +VLIB_REGISTER_NODE (ip6_multicast_node,static) = { + .function = ip6_drop, + .name = "ip6-multicast", + .vector_size = sizeof (u32), + + .format_trace = format_ip6_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +/* Compute TCP/UDP/ICMP6 checksum in software. */ +u16 ip6_tcp_udp_icmp_compute_checksum (vlib_main_t * vm, vlib_buffer_t * p0, ip6_header_t * ip0, int *bogus_lengthp) +{ + ip_csum_t sum0; + u16 sum16, payload_length_host_byte_order; + u32 i, n_this_buffer, n_bytes_left; + u32 headers_size = sizeof(ip0[0]); + void * data_this_buffer; + + ASSERT(bogus_lengthp); + *bogus_lengthp = 0; + + /* Initialize checksum with ip header. 
*/ + sum0 = ip0->payload_length + clib_host_to_net_u16 (ip0->protocol); + payload_length_host_byte_order = clib_net_to_host_u16 (ip0->payload_length); + data_this_buffer = (void *) (ip0 + 1); + + for (i = 0; i < ARRAY_LEN (ip0->src_address.as_uword); i++) + { + sum0 = ip_csum_with_carry (sum0, + clib_mem_unaligned (&ip0->src_address.as_uword[i], uword)); + sum0 = ip_csum_with_carry (sum0, + clib_mem_unaligned (&ip0->dst_address.as_uword[i], uword)); + } + + /* some icmp packets may come with a "router alert" hop-by-hop extension header (e.g., mldv2 packets) */ + if (PREDICT_FALSE (ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS)) + { + u32 skip_bytes; + ip6_hop_by_hop_ext_t *ext_hdr = (ip6_hop_by_hop_ext_t *)data_this_buffer; + + /* validate really icmp6 next */ + ASSERT(ext_hdr->next_hdr == IP_PROTOCOL_ICMP6); + + skip_bytes = 8* (1 + ext_hdr->n_data_u64s); + data_this_buffer = (void *)((u8 *)data_this_buffer + skip_bytes); + + payload_length_host_byte_order -= skip_bytes; + headers_size += skip_bytes; + } + + n_bytes_left = n_this_buffer = payload_length_host_byte_order; +#if DPDK > 0 + if (p0) + { + struct rte_mbuf *mb = ((struct rte_mbuf *)p0)-1; + u8 nb_segs = mb->nb_segs; + + n_this_buffer = (p0->current_length > headers_size ? + p0->current_length - headers_size : 0); + while (n_bytes_left) + { + sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer); + n_bytes_left -= n_this_buffer; + + mb = mb->next; + nb_segs--; + if ((nb_segs == 0) || (mb == 0)) + break; + + data_this_buffer = rte_ctrlmbuf_data(mb); + n_this_buffer = mb->data_len; + } + if (n_bytes_left || nb_segs) + { + *bogus_lengthp = 1; + return 0xfefe; + } + } + else sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer); +#else + if (p0 && n_this_buffer + headers_size > p0->current_length) + n_this_buffer = p0->current_length > headers_size ? 
p0->current_length - headers_size : 0; + while (1) + { + sum0 = ip_incremental_checksum (sum0, data_this_buffer, n_this_buffer); + n_bytes_left -= n_this_buffer; + if (n_bytes_left == 0) + break; + + if (!(p0->flags & VLIB_BUFFER_NEXT_PRESENT)) + { + *bogus_lengthp = 1; + return 0xfefe; + } + p0 = vlib_get_buffer (vm, p0->next_buffer); + data_this_buffer = vlib_buffer_get_current (p0); + n_this_buffer = p0->current_length; + } +#endif /* DPDK */ + + sum16 = ~ ip_csum_fold (sum0); + + return sum16; +} + +u32 ip6_tcp_udp_icmp_validate_checksum (vlib_main_t * vm, vlib_buffer_t * p0) +{ + ip6_header_t * ip0 = vlib_buffer_get_current (p0); + udp_header_t * udp0; + u16 sum16; + int bogus_length; + + /* some icmp packets may come with a "router alert" hop-by-hop extension header (e.g., mldv2 packets) */ + ASSERT (ip0->protocol == IP_PROTOCOL_TCP + || ip0->protocol == IP_PROTOCOL_ICMP6 + || ip0->protocol == IP_PROTOCOL_UDP + || ip0->protocol == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS); + + udp0 = (void *) (ip0 + 1); + if (ip0->protocol == IP_PROTOCOL_UDP && udp0->checksum == 0) + { + p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED + | IP_BUFFER_L4_CHECKSUM_CORRECT); + return p0->flags; + } + + sum16 = ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0, &bogus_length); + + p0->flags |= (IP_BUFFER_L4_CHECKSUM_COMPUTED + | ((sum16 == 0) << LOG2_IP_BUFFER_L4_CHECKSUM_CORRECT)); + + return p0->flags; +} + +static uword +ip6_local (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + ip_local_next_t next_index; + u32 * from, * to_next, n_left_from, n_left_to_next; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_input_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip6_forward_next_trace (vm, node, frame, VLIB_TX); + + while (n_left_from > 0) + { + 
vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t * p0, * p1; + ip6_header_t * ip0, * ip1; + udp_header_t * udp0, * udp1; + u32 pi0, ip_len0, udp_len0, flags0, next0; + u32 pi1, ip_len1, udp_len1, flags1, next1; + i32 len_diff0, len_diff1; + u8 error0, type0, good_l4_checksum0; + u8 error1, type1, good_l4_checksum1; + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; + type1 = lm->builtin_protocol_by_ip_protocol[ip1->protocol]; + + next0 = lm->local_next_by_ip_protocol[ip0->protocol]; + next1 = lm->local_next_by_ip_protocol[ip1->protocol]; + + flags0 = p0->flags; + flags1 = p1->flags; + + good_l4_checksum0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + good_l4_checksum1 = (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + + udp0 = ip6_next_header (ip0); + udp1 = ip6_next_header (ip1); + + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP && udp0->checksum == 0; + good_l4_checksum1 |= type1 == IP_BUILTIN_PROTOCOL_UDP && udp1->checksum == 0; + + good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN; + good_l4_checksum1 |= type1 == IP_BUILTIN_PROTOCOL_UNKNOWN; + + /* Verify UDP length. */ + ip_len0 = clib_net_to_host_u16 (ip0->payload_length); + ip_len1 = clib_net_to_host_u16 (ip1->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + udp_len1 = clib_net_to_host_u16 (udp1->length); + + len_diff0 = ip_len0 - udp_len0; + len_diff1 = ip_len1 - udp_len1; + + len_diff0 = type0 == IP_BUILTIN_PROTOCOL_UDP ? len_diff0 : 0; + len_diff1 = type1 == IP_BUILTIN_PROTOCOL_UDP ? 
len_diff1 : 0; + + if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN + && ! good_l4_checksum0 + && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))) + { + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0); + good_l4_checksum0 = + (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + } + if (PREDICT_FALSE (type1 != IP_BUILTIN_PROTOCOL_UNKNOWN + && ! good_l4_checksum1 + && ! (flags1 & IP_BUFFER_L4_CHECKSUM_COMPUTED))) + { + flags1 = ip6_tcp_udp_icmp_validate_checksum (vm, p1); + good_l4_checksum1 = + (flags1 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + } + + error0 = error1 = IP6_ERROR_UNKNOWN_PROTOCOL; + + error0 = len_diff0 < 0 ? IP6_ERROR_UDP_LENGTH : error0; + error1 = len_diff1 < 0 ? IP6_ERROR_UDP_LENGTH : error1; + + ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_UDP == IP6_ERROR_UDP_CHECKSUM); + ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_TCP == IP6_ERROR_TCP_CHECKSUM); + ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_ICMP == IP6_ERROR_ICMP_CHECKSUM); + error0 = (! good_l4_checksum0 + ? IP6_ERROR_UDP_CHECKSUM + type0 + : error0); + error1 = (! good_l4_checksum1 + ? IP6_ERROR_UDP_CHECKSUM + type1 + : error1); + + /* Drop packets from unroutable hosts. */ + /* If this is a neighbor solicitation (ICMP), skip source RPF check */ + if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL && type0 != IP_BUILTIN_PROTOCOL_ICMP) + { + u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0); + error0 = (lm->miss_adj_index == src_adj_index0 + ? IP6_ERROR_SRC_LOOKUP_MISS + : error0); + } + if (error1 == IP6_ERROR_UNKNOWN_PROTOCOL && type1 != IP_BUILTIN_PROTOCOL_ICMP) + { + u32 src_adj_index1 = ip6_src_lookup_for_packet (im, p1, ip1); + error1 = (lm->miss_adj_index == src_adj_index1 + ? IP6_ERROR_SRC_LOOKUP_MISS + : error1); + } + + next0 = error0 != IP6_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0; + next1 = error1 != IP6_ERROR_UNKNOWN_PROTOCOL ? 
IP_LOCAL_NEXT_DROP : next1; + + p0->error = error_node->errors[error0]; + p1->error = error_node->errors[error1]; + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + udp_header_t * udp0; + u32 pi0, ip_len0, udp_len0, flags0, next0; + i32 len_diff0; + u8 error0, type0, good_l4_checksum0; + + pi0 = to_next[0] = from[0]; + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + + ip0 = vlib_buffer_get_current (p0); + + type0 = lm->builtin_protocol_by_ip_protocol[ip0->protocol]; + next0 = lm->local_next_by_ip_protocol[ip0->protocol]; + + flags0 = p0->flags; + + good_l4_checksum0 = (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + + udp0 = ip6_next_header (ip0); + + /* Don't verify UDP checksum for packets with explicit zero checksum. */ + good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UDP && udp0->checksum == 0; + + good_l4_checksum0 |= type0 == IP_BUILTIN_PROTOCOL_UNKNOWN; + + /* Verify UDP length. */ + ip_len0 = clib_net_to_host_u16 (ip0->payload_length); + udp_len0 = clib_net_to_host_u16 (udp0->length); + + len_diff0 = ip_len0 - udp_len0; + + len_diff0 = type0 == IP_BUILTIN_PROTOCOL_UDP ? len_diff0 : 0; + + if (PREDICT_FALSE (type0 != IP_BUILTIN_PROTOCOL_UNKNOWN + && ! good_l4_checksum0 + && ! (flags0 & IP_BUFFER_L4_CHECKSUM_COMPUTED))) + { + flags0 = ip6_tcp_udp_icmp_validate_checksum (vm, p0); + good_l4_checksum0 = + (flags0 & IP_BUFFER_L4_CHECKSUM_CORRECT) != 0; + } + + error0 = IP6_ERROR_UNKNOWN_PROTOCOL; + + error0 = len_diff0 < 0 ? IP6_ERROR_UDP_LENGTH : error0; + + ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_UDP == IP6_ERROR_UDP_CHECKSUM); + ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_TCP == IP6_ERROR_TCP_CHECKSUM); + ASSERT (IP6_ERROR_UDP_CHECKSUM + IP_BUILTIN_PROTOCOL_ICMP == IP6_ERROR_ICMP_CHECKSUM); + error0 = (! 
good_l4_checksum0 + ? IP6_ERROR_UDP_CHECKSUM + type0 + : error0); + + /* If this is a neighbor solicitation (ICMP), skip source RPF check */ + if (error0 == IP6_ERROR_UNKNOWN_PROTOCOL && type0 != IP_BUILTIN_PROTOCOL_ICMP) + { + u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0); + error0 = (lm->miss_adj_index == src_adj_index0 + ? IP6_ERROR_SRC_LOOKUP_MISS + : error0); + } + + next0 = error0 != IP6_ERROR_UNKNOWN_PROTOCOL ? IP_LOCAL_NEXT_DROP : next0; + + p0->error = error_node->errors[error0]; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_local_node,static) = { + .function = ip6_local, + .name = "ip6-local", + .vector_size = sizeof (u32), + + .format_trace = format_ip6_forward_next_trace, + + .n_next_nodes = IP_LOCAL_N_NEXT, + .next_nodes = { + [IP_LOCAL_NEXT_DROP] = "error-drop", + [IP_LOCAL_NEXT_PUNT] = "error-punt", + // [IP_LOCAL_NEXT_TCP_LOOKUP] = "ip6-tcp-lookup", + [IP_LOCAL_NEXT_UDP_LOOKUP] = "ip6-udp-lookup", + [IP_LOCAL_NEXT_ICMP] = "ip6-icmp-input", + }, +}; + +void ip6_register_protocol (u32 protocol, u32 node_index) +{ + vlib_main_t * vm = vlib_get_main(); + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + + ASSERT (protocol < ARRAY_LEN (lm->local_next_by_ip_protocol)); + lm->local_next_by_ip_protocol[protocol] = vlib_node_add_next (vm, ip6_local_node.index, node_index); +} + +typedef enum { + IP6_DISCOVER_NEIGHBOR_NEXT_DROP, + IP6_DISCOVER_NEIGHBOR_N_NEXT, +} ip6_discover_neighbor_next_t; + +typedef enum { + IP6_DISCOVER_NEIGHBOR_ERROR_DROP, + IP6_DISCOVER_NEIGHBOR_ERROR_REQUEST_SENT, +} ip6_discover_neighbor_error_t; + +static uword +ip6_discover_neighbor (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = 
&im->lookup_main; + u32 * from, * to_next_drop; + uword n_left_from, n_left_to_next_drop; + /* Throttle state shared across calls: hash seeds and a 256-bit bitmap of keys already seen. */ + static f64 time_last_seed_change = -1e100; + static u32 hash_seeds[3]; + static uword hash_bitmap[256 / BITS (uword)]; + f64 time_now; + int bogus_length; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip6_forward_next_trace (vm, node, frame, VLIB_TX); + + /* Pick fresh seeds and forget all seen keys once more than 1e-3 (presumably seconds -- confirm vlib_time_now units) has elapsed since the last change. */ + time_now = vlib_time_now (vm); + if (time_now - time_last_seed_change > 1e-3) + { + uword i; + u32 * r = clib_random_buffer_get_data (&vm->random_buffer, + sizeof (hash_seeds)); + for (i = 0; i < ARRAY_LEN (hash_seeds); i++) + hash_seeds[i] = r[i]; + + /* Mark all hash keys as not seen before. */ + for (i = 0; i < ARRAY_LEN (hash_bitmap); i++) + hash_bitmap[i] = 0; + + time_last_seed_change = time_now; + } + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, IP6_DISCOVER_NEIGHBOR_NEXT_DROP, + to_next_drop, n_left_to_next_drop); + + while (n_left_from > 0 && n_left_to_next_drop > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + u32 pi0, adj_index0, a0, b0, c0, sw_if_index0, drop0; + /* m0 is a uword so that a mask bit at position 32 or above is not truncated. */ + uword bm0, m0; + ip_adjacency_t * adj0; + vnet_hw_interface_t * hw_if0; + u32 next0; + + pi0 = from[0]; + + p0 = vlib_get_buffer (vm, pi0); + + adj_index0 = vnet_buffer (p0)->ip.adj_index[VLIB_TX]; + + ip0 = vlib_buffer_get_current (p0); + + adj0 = ip_get_adjacency (lm, adj_index0); + + /* Hash (tx sw_if_index, destination address) with the current seeds. */ + a0 = hash_seeds[0]; + b0 = hash_seeds[1]; + c0 = hash_seeds[2]; + + sw_if_index0 = adj0->rewrite_header.sw_if_index; + vnet_buffer (p0)->sw_if_index[VLIB_TX] = sw_if_index0; + + a0 ^= sw_if_index0; + b0 ^= ip0->dst_address.as_u32[0]; + c0 ^= ip0->dst_address.as_u32[1]; + + hash_v3_mix32 (a0, b0, c0); + + b0 ^= ip0->dst_address.as_u32[2]; + c0 ^= ip0->dst_address.as_u32[3]; + + hash_v3_finalize32 (a0, b0, c0); + + /* Reduce the hash to a bit number in the bitmap, then split it into a bit mask and a word index. The mask must be taken before c0 is divided down to the word index. */ + c0 &= BITS (hash_bitmap) - 1; + m0 = (uword) 1 << (c0 % BITS (uword)); + c0 = c0 / BITS (uword); + + bm0 = hash_bitmap[c0]; + drop0 = (bm0 & m0) != 0; + + /*
Mark it as seen. */ + hash_bitmap[c0] = bm0 | m0; + + from += 1; + n_left_from -= 1; + to_next_drop[0] = pi0; + to_next_drop += 1; + n_left_to_next_drop -= 1; + + hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index0); + + /* If the interface is link-down, drop the pkt */ + if (!(hw_if0->flags & VNET_HW_INTERFACE_FLAG_LINK_UP)) + drop0 = 1; + + p0->error = + node->errors[drop0 ? IP6_DISCOVER_NEIGHBOR_ERROR_DROP + : IP6_DISCOVER_NEIGHBOR_ERROR_REQUEST_SENT]; + if (drop0) + continue; + + { + u32 bi0 = 0; + icmp6_neighbor_solicitation_header_t * h0; + vlib_buffer_t * b0; + + h0 = vlib_packet_template_get_packet + (vm, &im->discover_neighbor_packet_template, &bi0); + + /* + * Build ethernet header. + * Choose source address based on destination lookup + * adjacency. + */ + ip6_src_address_for_packet (im, p0, &h0->ip.src_address, + sw_if_index0); + + /* + * Destination address is a solicited node multicast address. + * We need to fill in + * the low 24 bits with low 24 bits of target's address. + */ + h0->ip.dst_address.as_u8[13] = ip0->dst_address.as_u8[13]; + h0->ip.dst_address.as_u8[14] = ip0->dst_address.as_u8[14]; + h0->ip.dst_address.as_u8[15] = ip0->dst_address.as_u8[15]; + + h0->neighbor.target_address = ip0->dst_address; + + memcpy (h0->link_layer_option.ethernet_address, + hw_if0->hw_address, vec_len (hw_if0->hw_address)); + + /* $$$$ appears we need this; why is the checksum non-zero? */ + h0->neighbor.icmp.checksum = 0; + h0->neighbor.icmp.checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, 0, &h0->ip, + &bogus_length); + + ASSERT (bogus_length == 0); + + vlib_buffer_copy_trace_flag (vm, p0, bi0); + b0 = vlib_get_buffer (vm, bi0); + vnet_buffer (b0)->sw_if_index[VLIB_TX] + = vnet_buffer (p0)->sw_if_index[VLIB_TX]; + + /* Add rewrite/encap string. 
*/ + vnet_rewrite_one_header (adj0[0], h0, + sizeof (ethernet_header_t)); + vlib_buffer_advance (b0, -adj0->rewrite_header.data_bytes); + + /* $$$$ hack in case next0 == 0 */ + b0->error = node->errors[IP6_DISCOVER_NEIGHBOR_ERROR_DROP]; + next0 = + vec_elt (im->discover_neighbor_next_index_by_hw_if_index, + hw_if0->hw_if_index); + + vlib_set_next_frame_buffer (vm, node, next0, bi0); + } + } + + vlib_put_next_frame (vm, node, IP6_DISCOVER_NEIGHBOR_NEXT_DROP, + n_left_to_next_drop); + } + + return frame->n_vectors; +} + +static char * ip6_discover_neighbor_error_strings[] = { + [IP6_DISCOVER_NEIGHBOR_ERROR_DROP] = "address overflow drops", + [IP6_DISCOVER_NEIGHBOR_ERROR_REQUEST_SENT] + = "neighbor solicitations sent", +}; + +VLIB_REGISTER_NODE (ip6_discover_neighbor_node) = { + .function = ip6_discover_neighbor, + .name = "ip6-discover-neighbor", + .vector_size = sizeof (u32), + + .format_trace = format_ip6_forward_next_trace, + + .n_errors = ARRAY_LEN (ip6_discover_neighbor_error_strings), + .error_strings = ip6_discover_neighbor_error_strings, + + .n_next_nodes = IP6_DISCOVER_NEIGHBOR_N_NEXT, + .next_nodes = { + [IP6_DISCOVER_NEIGHBOR_NEXT_DROP] = "error-drop", + }, +}; + +clib_error_t * +ip6_discover_neighbor_hw_interface_link_up_down (vnet_main_t * vnm, + u32 hw_if_index, + u32 flags) +{ + vlib_main_t * vm = vnm->vlib_main; + ip6_main_t * im = &ip6_main; + vnet_hw_interface_t * hw_if; + + hw_if = vnet_get_hw_interface (vnm, hw_if_index); + + vec_validate_init_empty + (im->discover_neighbor_next_index_by_hw_if_index, hw_if_index, 0); + im->discover_neighbor_next_index_by_hw_if_index[hw_if_index] + = vlib_node_add_next (vm, ip6_discover_neighbor_node.index, + hw_if->output_node_index); + return 0; +} + +VNET_HW_INTERFACE_LINK_UP_DOWN_FUNCTION +(ip6_discover_neighbor_hw_interface_link_up_down); + +clib_error_t * +ip6_probe_neighbor (vlib_main_t * vm, ip6_address_t * dst, u32 sw_if_index) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im = &ip6_main; + 
icmp6_neighbor_solicitation_header_t * h; + ip6_address_t * src; + ip_interface_address_t * ia; + ip_adjacency_t * adj; + vnet_hw_interface_t * hi; + vnet_sw_interface_t * si; + vlib_buffer_t * b; + u32 bi = 0; + int bogus_length; + + si = vnet_get_sw_interface (vnm, sw_if_index); + + if (!(si->flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)) + { + return clib_error_return (0, "%U: interface %U down", + format_ip6_address, dst, + format_vnet_sw_if_index_name, vnm, + sw_if_index); + } + + src = ip6_interface_address_matching_destination (im, dst, sw_if_index, &ia); + if (! src) + { + vnm->api_errno = VNET_API_ERROR_NO_MATCHING_INTERFACE; + return clib_error_return + (0, "no matching interface address for destination %U (interface %U)", + format_ip6_address, dst, + format_vnet_sw_if_index_name, vnm, sw_if_index); + } + + h = vlib_packet_template_get_packet (vm, &im->discover_neighbor_packet_template, &bi); + + hi = vnet_get_sup_hw_interface (vnm, sw_if_index); + + /* Destination address is a solicited node multicast address. We need to fill in + the low 24 bits with low 24 bits of target's address. */ + h->ip.dst_address.as_u8[13] = dst->as_u8[13]; + h->ip.dst_address.as_u8[14] = dst->as_u8[14]; + h->ip.dst_address.as_u8[15] = dst->as_u8[15]; + + h->ip.src_address = src[0]; + h->neighbor.target_address = dst[0]; + + memcpy (h->link_layer_option.ethernet_address, hi->hw_address, vec_len (hi->hw_address)); + + h->neighbor.icmp.checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, 0, &h->ip, &bogus_length); + ASSERT(bogus_length == 0); + + b = vlib_get_buffer (vm, bi); + vnet_buffer (b)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_TX] = sw_if_index; + + /* Add encapsulation string for software interface (e.g. ethernet header). 
*/ + adj = ip_get_adjacency (&im->lookup_main, ia->neighbor_probe_adj_index); + vnet_rewrite_one_header (adj[0], h, sizeof (ethernet_header_t)); + vlib_buffer_advance (b, -adj->rewrite_header.data_bytes); + + { + vlib_frame_t * f = vlib_get_frame_to_node (vm, hi->output_node_index); + u32 * to_next = vlib_frame_vector_args (f); + to_next[0] = bi; + f->n_vectors = 1; + vlib_put_frame_to_node (vm, hi->output_node_index, f); + } + + return /* no error */ 0; +} + +typedef enum { + IP6_REWRITE_NEXT_DROP, +} ip6_rewrite_next_t; + +always_inline uword +ip6_rewrite_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + int rewrite_for_locally_received_packets) +{ + ip_lookup_main_t * lm = &ip6_main.lookup_main; + u32 * from = vlib_frame_vector_args (frame); + u32 n_left_from, n_left_to_next, * to_next, next_index; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_input_node.index); + vlib_rx_or_tx_t adj_rx_tx = rewrite_for_locally_received_packets ? VLIB_RX : VLIB_TX; + + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + u32 cpu_index = os_get_cpu_number(); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + ip_adjacency_t * adj0, * adj1; + vlib_buffer_t * p0, * p1; + ip6_header_t * ip0, * ip1; + u32 pi0, rw_len0, next0, error0, adj_index0; + u32 pi1, rw_len1, next1, error1, adj_index1; + + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->pre_data, 32, STORE); + CLIB_PREFETCH (p3->pre_data, 32, STORE); + + CLIB_PREFETCH (p2->data, sizeof (ip0[0]), STORE); + CLIB_PREFETCH (p3->data, sizeof (ip0[0]), STORE); + } + + pi0 = to_next[0] = from[0]; + pi1 = to_next[1] = from[1]; + + from += 2; + n_left_from -= 2; + to_next += 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx]; + adj_index1 = vnet_buffer (p1)->ip.adj_index[adj_rx_tx]; + + /* We should never rewrite a pkt using the MISS adjacency */ + ASSERT(adj_index0 && adj_index1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + error0 = error1 = IP6_ERROR_NONE; + + if (! rewrite_for_locally_received_packets) + { + i32 hop_limit0 = ip0->hop_limit, hop_limit1 = ip1->hop_limit; + + /* Input node should have reject packets with hop limit 0. */ + ASSERT (ip0->hop_limit > 0); + ASSERT (ip1->hop_limit > 0); + + hop_limit0 -= 1; + hop_limit1 -= 1; + + ip0->hop_limit = hop_limit0; + ip1->hop_limit = hop_limit1; + + error0 = hop_limit0 <= 0 ? IP6_ERROR_TIME_EXPIRED : error0; + error1 = hop_limit1 <= 0 ? IP6_ERROR_TIME_EXPIRED : error1; + } + + adj0 = ip_get_adjacency (lm, adj_index0); + adj1 = ip_get_adjacency (lm, adj_index1); + + if (rewrite_for_locally_received_packets) + { + /* + * If someone sends e.g. 
an icmp6 w/ src = dst = interface addr, + * we end up here with a local adjacency in hand + */ + if (PREDICT_FALSE(adj0->lookup_next_index + == IP_LOOKUP_NEXT_LOCAL)) + error0 = IP6_ERROR_SPOOFED_LOCAL_PACKETS; + if (PREDICT_FALSE(adj1->lookup_next_index + == IP_LOOKUP_NEXT_LOCAL)) + error1 = IP6_ERROR_SPOOFED_LOCAL_PACKETS; + } + + rw_len0 = adj0[0].rewrite_header.data_bytes; + rw_len1 = adj1[0].rewrite_header.data_bytes; + + vlib_increment_combined_counter (&lm->adjacency_counters, + cpu_index, + adj_index0, + /* packet increment */ 0, + /* byte increment */ rw_len0); + vlib_increment_combined_counter (&lm->adjacency_counters, + cpu_index, + adj_index1, + /* packet increment */ 0, + /* byte increment */ rw_len1); + + /* Check MTU of outgoing interface. */ + error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes + ? IP6_ERROR_MTU_EXCEEDED + : error0); + error1 = (vlib_buffer_length_in_chain (vm, p1) > adj1[0].rewrite_header.max_l3_packet_bytes + ? IP6_ERROR_MTU_EXCEEDED + : error1); + + p0->current_data -= rw_len0; + p1->current_data -= rw_len1; + + p0->current_length += rw_len0; + p1->current_length += rw_len1; + + vnet_buffer (p0)->sw_if_index[VLIB_TX] = adj0[0].rewrite_header.sw_if_index; + vnet_buffer (p1)->sw_if_index[VLIB_TX] = adj1[0].rewrite_header.sw_if_index; + + next0 = (error0 == IP6_ERROR_NONE) ? + adj0[0].rewrite_header.next_index : IP6_REWRITE_NEXT_DROP; + next1 = (error1 == IP6_ERROR_NONE) ? + adj1[0].rewrite_header.next_index : IP6_REWRITE_NEXT_DROP; + + /* Guess we are only writing on simple Ethernet header. 
*/ + vnet_rewrite_two_headers (adj0[0], adj1[0], + ip0, ip1, + sizeof (ethernet_header_t)); + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + ip_adjacency_t * adj0; + vlib_buffer_t * p0; + ip6_header_t * ip0; + u32 pi0, rw_len0; + u32 adj_index0, next0, error0; + + pi0 = to_next[0] = from[0]; + + p0 = vlib_get_buffer (vm, pi0); + + adj_index0 = vnet_buffer (p0)->ip.adj_index[adj_rx_tx]; + + /* We should never rewrite a pkt using the MISS adjacency */ + ASSERT(adj_index0); + + adj0 = ip_get_adjacency (lm, adj_index0); + + ip0 = vlib_buffer_get_current (p0); + + error0 = IP6_ERROR_NONE; + + /* Check hop limit */ + if (! rewrite_for_locally_received_packets) + { + i32 hop_limit0 = ip0->hop_limit; + + ASSERT (ip0->hop_limit > 0); + + hop_limit0 -= 1; + + ip0->hop_limit = hop_limit0; + + error0 = hop_limit0 <= 0 ? IP6_ERROR_TIME_EXPIRED : error0; + } + + if (rewrite_for_locally_received_packets) + { + if (PREDICT_FALSE(adj0->lookup_next_index + == IP_LOOKUP_NEXT_LOCAL)) + error0 = IP6_ERROR_SPOOFED_LOCAL_PACKETS; + } + + /* Guess we are only writing on simple Ethernet header. */ + vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); + + /* Update packet buffer attributes/set output interface. */ + rw_len0 = adj0[0].rewrite_header.data_bytes; + + vlib_increment_combined_counter (&lm->adjacency_counters, + cpu_index, + adj_index0, + /* packet increment */ 0, + /* byte increment */ rw_len0); + + /* Check MTU of outgoing interface. */ + error0 = (vlib_buffer_length_in_chain (vm, p0) > adj0[0].rewrite_header.max_l3_packet_bytes + ? IP6_ERROR_MTU_EXCEEDED + : error0); + + p0->current_data -= rw_len0; + p0->current_length += rw_len0; + vnet_buffer (p0)->sw_if_index[VLIB_TX] = adj0[0].rewrite_header.sw_if_index; + + next0 = (error0 == IP6_ERROR_NONE) ? 
+ adj0[0].rewrite_header.next_index : IP6_REWRITE_NEXT_DROP; + + p0->error = error_node->errors[error0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Need to do trace after rewrites to pick up new packet data. */ + if (node->flags & VLIB_NODE_FLAG_TRACE) + ip6_forward_next_trace (vm, node, frame, adj_rx_tx); + + return frame->n_vectors; +} + +static uword +ip6_rewrite_transit (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip6_rewrite_inline (vm, node, frame, + /* rewrite_for_locally_received_packets */ 0); +} + +static uword +ip6_rewrite_local (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip6_rewrite_inline (vm, node, frame, + /* rewrite_for_locally_received_packets */ 1); +} + +VLIB_REGISTER_NODE (ip6_rewrite_node) = { + .function = ip6_rewrite_transit, + .name = "ip6-rewrite", + .vector_size = sizeof (u32), + + .format_trace = format_ip6_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [IP6_REWRITE_NEXT_DROP] = "error-drop", + }, +}; + +VLIB_REGISTER_NODE (ip6_rewrite_local_node,static) = { + .function = ip6_rewrite_local, + .name = "ip6-rewrite-local", + .vector_size = sizeof (u32), + + .sibling_of = "ip6-rewrite", + + .format_trace = format_ip6_forward_next_trace, + + .n_next_nodes = 1, + .next_nodes = { + [IP6_REWRITE_NEXT_DROP] = "error-drop", + }, +}; + +/* Global IP6 main. 
*/ +ip6_main_t ip6_main; + /* Init function for the IPv6 lookup code: fills im->fib_masks, initializes the lookup main and the bihash lookup table, creates the FIB for table id 0, sets the packet-generator edit function on ip6-lookup, and builds the neighbor solicitation packet template. Always returns 0. */ +static clib_error_t * +ip6_lookup_init (vlib_main_t * vm) +{ + ip6_main_t * im = &ip6_main; + uword i; + /* Mask for index i: the first i / 32 words are all ones, followed by i % 32 leading bits (stored in network byte order) in the next word. */ + for (i = 0; i < ARRAY_LEN (im->fib_masks); i++) + { + u32 j, i0, i1; + + i0 = i / 32; + i1 = i % 32; + + for (j = 0; j < i0; j++) + im->fib_masks[i].as_u32[j] = ~0; + + if (i1) + im->fib_masks[i].as_u32[i0] = clib_host_to_net_u32 (pow2_mask (i1) << (32 - i1)); + } + + ip_lookup_init (&im->lookup_main, /* is_ip6 */ 1); + /* Use the default bucket count and memory size when none were set (zero), and force the bucket count to a power of two (presumably rounding up -- confirm max_log2). */ + if (im->lookup_table_nbuckets == 0) + im->lookup_table_nbuckets = IP6_FIB_DEFAULT_HASH_NUM_BUCKETS; + + im->lookup_table_nbuckets = 1<< max_log2 (im->lookup_table_nbuckets); + + if (im->lookup_table_size == 0) + im->lookup_table_size = IP6_FIB_DEFAULT_HASH_MEMORY_SIZE; + + BV(clib_bihash_init) (&im->ip6_lookup_table, "ip6 lookup table", + im->lookup_table_nbuckets, + im->lookup_table_size); + + /* Create FIB with index 0 and table id of 0. */ + find_ip6_fib_by_table_index_or_id (im, /* table id */ 0, IP6_ROUTE_FLAG_TABLE_ID); + + { + pg_node_t * pn; + pn = pg_get_node (ip6_lookup_node.index); + pn->unformat_edit = unformat_pg_ip6_header; + } + /* Template for neighbor solicitations: zeroed header with IP version 6, ICMP6 protocol, hop limit 255, a solicited-node multicast destination and a source link-layer address option. Callers fill in the addresses and checksum. */ + { + icmp6_neighbor_solicitation_header_t p; + + memset (&p, 0, sizeof (p)); + + p.ip.ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28); + p.ip.payload_length = clib_host_to_net_u16 (sizeof (p) + - STRUCT_OFFSET_OF (icmp6_neighbor_solicitation_header_t, neighbor)); + p.ip.protocol = IP_PROTOCOL_ICMP6; + p.ip.hop_limit = 255; + ip6_set_solicited_node_multicast_address (&p.ip.dst_address, 0); + + p.neighbor.icmp.type = ICMP6_neighbor_solicitation; + + p.link_layer_option.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address; + p.link_layer_option.header.n_data_u64s = sizeof (p.link_layer_option) / sizeof (u64); + + vlib_packet_template_init (vm, + &im->discover_neighbor_packet_template, + &p, sizeof (p), + /* alloc chunk size */ 8, + "ip6 neighbor discovery"); + } + + return 0; +} + +VLIB_INIT_FUNCTION (ip6_lookup_init); + +static
clib_error_t * +add_del_ip6_interface_table (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ /* CLI handler: parses <interface> <table-id> and, when a FIB is returned for that table id, records its index for the interface. Returns a parse error, or 0. */ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 sw_if_index, table_id; + + sw_if_index = ~0; + + if (! unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + error = clib_error_return (0, "unknown interface `%U'", + format_unformat_error, input); + goto done; + } + + if (unformat (input, "%d", &table_id)) + ; + else + { + error = clib_error_return (0, "expected table id `%U'", + format_unformat_error, input); + goto done; + } + + { + ip6_main_t * im = &ip6_main; + ip6_fib_t * fib = + find_ip6_fib_by_table_index_or_id (im, table_id, IP6_ROUTE_FLAG_TABLE_ID); + + if (fib) + { + vec_validate (im->fib_index_by_sw_if_index, sw_if_index); + im->fib_index_by_sw_if_index[sw_if_index] = fib->index; + } + } + + done: + return error; +} + +VLIB_CLI_COMMAND (set_interface_ip_table_command, static) = { + .path = "set interface ip6 table", + .function = add_del_ip6_interface_table, + .short_help = "set interface ip6 table <intfc> <table-id>" +}; + /* Build an fe80::/64 link-local address from a 6-byte MAC: the low 8 bytes are mac[0..2], 0xFF, 0xFE, mac[3..5], with bit 1 of mac[0] flipped. NOTE(review): as_u64[1] is only written through the as_u8 stores, which cover all 8 bytes. */ +void +ip6_link_local_address_from_ethernet_mac_address (ip6_address_t *ip, + u8 *mac) +{ + ip->as_u64[0] = clib_host_to_net_u64 (0xFE80000000000000ULL); + /* Invert the "u" bit */ + ip->as_u8 [8] = mac[0] ^ (1<<1); + ip->as_u8 [9] = mac[1]; + ip->as_u8 [10] = mac[2]; + ip->as_u8 [11] = 0xFF; + ip->as_u8 [12] = 0xFE; + ip->as_u8 [13] = mac[3]; + ip->as_u8 [14] = mac[4]; + ip->as_u8 [15] = mac[5]; +} + /* Recover a 6-byte MAC from address bytes 8-10 and 13-15, flipping bit 1 of the first byte back. Bytes 11 and 12 are not checked for 0xFF 0xFE. */ +void +ip6_ethernet_mac_address_from_link_local_address (u8 *mac, + ip6_address_t *ip) +{ + /* Invert the previously inverted "u" bit */ + mac[0] = ip->as_u8 [8] ^ (1<<1); + mac[1] = ip->as_u8 [9]; + mac[2] = ip->as_u8 [10]; + mac[3] = ip->as_u8 [13]; + mac[4] = ip->as_u8 [14]; + mac[5] = ip->as_u8 [15]; +} + /* CLI test: converts the given MAC to a link-local address and back, printing both. */ +static clib_error_t * +test_ip6_link_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u8 mac[6]; + ip6_address_t _a, *a =
&_a; + + if (unformat (input, "%U", unformat_ethernet_address, mac)) + { + ip6_link_local_address_from_ethernet_mac_address (a, mac); + vlib_cli_output (vm, "Link local address: %U", + format_ip6_address, a); + ip6_ethernet_mac_address_from_link_local_address (mac, a); + vlib_cli_output (vm, "Original MAC address: %U", + format_ethernet_address, mac); + } + + return 0; +} + +VLIB_CLI_COMMAND (test_link_command, static) = { + .path = "test ip6 link", + .function = test_ip6_link_command_fn, + .short_help = "test ip6 link <mac-address>", +}; + +int vnet_set_ip6_flow_hash (u32 table_id, u32 flow_hash_config) +{ + ip6_main_t * im6 = &ip6_main; + ip6_fib_t * fib; + uword * p = hash_get (im6->fib_index_by_table_id, table_id); + + if (p == 0) + return -1; + + fib = vec_elt_at_index (im6->fibs, p[0]); + + fib->flow_hash_config = flow_hash_config; + return 1; +} + +static clib_error_t * +set_ip6_flow_hash_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + int matched = 0; + u32 table_id = 0; + u32 flow_hash_config = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "table %d", &table_id)) + matched = 1; +#define _(a,v) \ + else if (unformat (input, #a)) { flow_hash_config |= v; matched=1;} + foreach_flow_hash_bit +#undef _ + else break; + } + + if (matched == 0) + return clib_error_return (0, "unknown input `%U'", + format_unformat_error, input); + + rv = vnet_set_ip6_flow_hash (table_id, flow_hash_config); + switch (rv) + { + case 1: + break; + + case -1: + return clib_error_return (0, "no such FIB table %d", table_id); + + default: + clib_warning ("BUG: illegal flow hash config 0x%x", flow_hash_config); + break; + } + + return 0; +} + +VLIB_CLI_COMMAND (set_ip6_flow_hash_command, static) = { + .path = "set ip6 flow-hash", + .short_help = + "set ip table flow-hash table <fib-id> src dst sport dport proto reverse", + .function = set_ip6_flow_hash_command_fn, +}; + +static 
clib_error_t * +show_ip6_local_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + int i; + + vlib_cli_output (vm, "Protocols handled by ip6_local"); + for (i = 0; i < ARRAY_LEN(lm->local_next_by_ip_protocol); i++) + { + if (lm->local_next_by_ip_protocol[i] != IP_LOCAL_NEXT_PUNT) + vlib_cli_output (vm, "%d", i); + } + return 0; +} + + + +VLIB_CLI_COMMAND (show_ip_local, static) = { + .path = "show ip6 local", + .function = show_ip6_local_command_fn, + .short_help = "Show ip6 local protocol table", +}; + +int vnet_set_ip6_classify_intfc (vlib_main_t * vm, u32 sw_if_index, + u32 table_index) +{ + vnet_main_t * vnm = vnet_get_main(); + vnet_interface_main_t * im = &vnm->interface_main; + ip6_main_t * ipm = &ip6_main; + ip_lookup_main_t * lm = &ipm->lookup_main; + vnet_classify_main_t * cm = &vnet_classify_main; + + if (pool_is_free_index (im->sw_interfaces, sw_if_index)) + return VNET_API_ERROR_NO_MATCHING_INTERFACE; + + if (table_index != ~0 && pool_is_free_index (cm->tables, table_index)) + return VNET_API_ERROR_NO_SUCH_ENTRY; + + vec_validate (lm->classify_table_index_by_sw_if_index, sw_if_index); + lm->classify_table_index_by_sw_if_index [sw_if_index] = table_index; + + return 0; +} + +static clib_error_t * +set_ip6_classify_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + u32 table_index = ~0; + int table_index_set = 0; + u32 sw_if_index = ~0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "table-index %d", &table_index)) + table_index_set = 1; + else if (unformat (input, "intfc %U", unformat_vnet_sw_interface, + vnet_get_main(), &sw_if_index)) + ; + else + break; + } + + if (table_index_set == 0) + return clib_error_return (0, "classify table-index must be specified"); + + if (sw_if_index == ~0) + return clib_error_return (0, "interface / subif must be 
specified"); + + rv = vnet_set_ip6_classify_intfc (vm, sw_if_index, table_index); + + switch (rv) + { + case 0: + break; + + case VNET_API_ERROR_NO_MATCHING_INTERFACE: + return clib_error_return (0, "No such interface"); + + case VNET_API_ERROR_NO_SUCH_ENTRY: + return clib_error_return (0, "No such classifier table"); + } + return 0; +} + +VLIB_CLI_COMMAND (set_ip6_classify_command, static) = { + .path = "set ip6 classify", + .short_help = + "set ip6 classify intfc <int> table-index <index>", + .function = set_ip6_classify_command_fn, +}; + +static clib_error_t * +ip6_config (vlib_main_t * vm, unformat_input_t * input) +{ + ip6_main_t * im = &ip6_main; + uword heapsize = 0; + u32 tmp; + u32 nbuckets = 0; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) { + if (unformat (input, "hash-buckets %d", &tmp)) + nbuckets = tmp; + else if (unformat (input, "heap-size %dm", &tmp)) + heapsize = ((u64)tmp) << 20; + else if (unformat (input, "heap-size %dM", &tmp)) + heapsize = ((u64)tmp) << 20; + else if (unformat (input, "heap-size %dg", &tmp)) + heapsize = ((u64)tmp) << 30; + else if (unformat (input, "heap-size %dG", &tmp)) + heapsize = ((u64)tmp) << 30; + else + return clib_error_return (0, "unknown input '%U'", + format_unformat_error, input); + } + + im->lookup_table_nbuckets = nbuckets; + im->lookup_table_size = heapsize; + + return 0; +} + +VLIB_EARLY_CONFIG_FUNCTION (ip6_config, "ip6"); + diff --git a/vnet/vnet/ip/ip6_hop_by_hop.c b/vnet/vnet/ip/ip6_hop_by_hop.c new file mode 100644 index 00000000000..64edfd249c3 --- /dev/null +++ b/vnet/vnet/ip/ip6_hop_by_hop.c @@ -0,0 +1,1139 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include <vlib/vlib.h> +#include <vnet/vnet.h> +#include <vnet/pg/pg.h> +#include <vppinfra/error.h> + +#include <vnet/ip/ip.h> + +#include <vppinfra/hash.h> +#include <vppinfra/error.h> +#include <vppinfra/elog.h> + +#include <vnet/ip/ip6_hop_by_hop.h> + +ip6_hop_by_hop_main_t ip6_hop_by_hop_main; + +/* + * ip6 hop-by-hop option handling. We push pkts with h-b-h options to + * ip6_hop_by_hop_node_fn from ip6-lookup at a cost of ~2 clocks/pkt in + * the speed path. + * + * We parse through the h-b-h option TLVs, specifically looking for + * HBH_OPTION_TYPE_IOAM_DATA_LIST. [Someone needs to get bananas from + * IANA, aka to actually allocate the option TLV codes.] + * + * If we find the indicated option type, and we have remaining list + * elements in the trace list, allocate and populate the trace list + * element. + * + * At the ingress edge: punch in the h-b-h rewrite, then visit the + * standard h-b-h option handler. We have to be careful in the standard + * h-b-h handler, to avoid looping until we run out of rewrite space. + * Ask me how I know that. + * + * Remaining work: + * decide on egress point "pop and count" scheme + * time stamp handling: usec since the top of the hour? 
+ * configure the node id + * trace list application data support + * cons up analysis / steering plug-in(s) + * add configuration binary APIs, vpe_api_test_support, yang models and + * orca code + * perf tune: dual loop, replace memcpy w/ N x 8-byte load/stores + * + */ + +/* + * primary h-b-h handler trace support + * We work pretty hard on the problem for obvious reasons + */ +typedef struct { + u32 next_index; + u32 trace_len; + u8 option_data[256]; +} ip6_hop_by_hop_trace_t; + +static u8 * format_ioam_data_list_element (u8 * s, va_list * args) +{ + ioam_data_list_element_t *elt = va_arg (*args, ioam_data_list_element_t *); + u32 ttl_node_id_host_byte_order = + clib_net_to_host_u32 (elt->ttl_node_id); + + s = format (s, "ttl %d node id %d ingress %d egress %d ts %u", + ttl_node_id_host_byte_order>>24, + ttl_node_id_host_byte_order & 0x00FFFFFF, + elt->ingress_if, + elt->egress_if, + elt->timestamp); + return s; +} + +static u8 * format_ip6_hop_by_hop_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip6_hop_by_hop_trace_t * t = va_arg (*args, ip6_hop_by_hop_trace_t *); + ip6_hop_by_hop_header_t *hbh0; + ip6_hop_by_hop_option_t *opt0, *limit0; + ioam_trace_option_t * trace0; + ioam_data_list_element_t * elt0; + int elt_index; + u8 type0; + + hbh0 = (ip6_hop_by_hop_header_t *)t->option_data; + + s = format (s, "IP6_HOP_BY_HOP: next index %d len %d traced %d\n", + t->next_index, (hbh0->length+1)<<3, t->trace_len); + + opt0 = (ip6_hop_by_hop_option_t *) (hbh0+1); + limit0 = (ip6_hop_by_hop_option_t *) ((u8 *)hbh0) + t->trace_len; + + while (opt0 < limit0) + { + type0 = opt0->type & HBH_OPTION_TYPE_MASK; + elt_index = 0; + switch (type0) + { + case HBH_OPTION_TYPE_IOAM_DATA_LIST: + trace0 = (ioam_trace_option_t *)opt0; + s = format (s, " Trace %d elts left\n", + trace0->data_list_elts_left); + elt0 = &trace0->elts[0]; + while ((u8 *) elt0 < + ((u8 
*)(&trace0->elts[0]) + trace0->hdr.length - 1 + /* -1 accounts for elts_left */)) + { + s = format (s, " [%d] %U\n",elt_index, + format_ioam_data_list_element, elt0); + elt_index++; + elt0++; + } + + opt0 = (ip6_hop_by_hop_option_t *) + (((u8 *)opt0) + opt0->length + + sizeof (ip6_hop_by_hop_option_t)); + break; + + case HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK: + s = format (s, " POW opt present\n"); + opt0 = (ip6_hop_by_hop_option_t *) + (((u8 *)opt0) + sizeof (ioam_pow_option_t)); + break; + + case 0: /* Pad, just stop */ + opt0 = (ip6_hop_by_hop_option_t *) ((u8 *)opt0) + 1; + break; + + default: + s = format (s, "Unknown %d", type0); + opt0 = (ip6_hop_by_hop_option_t *) + (((u8 *)opt0) + opt0->length + + sizeof (ip6_hop_by_hop_option_t)); + break; + } + } + return s; +} + +vlib_node_registration_t ip6_hop_by_hop_node; + +#define foreach_ip6_hop_by_hop_error \ +_(PROCESSED, "Pkts with ip6 hop-by-hop options") + +typedef enum { +#define _(sym,str) IP6_HOP_BY_HOP_ERROR_##sym, + foreach_ip6_hop_by_hop_error +#undef _ + IP6_HOP_BY_HOP_N_ERROR, +} ip6_hop_by_hop_error_t; + +static char * ip6_hop_by_hop_error_strings[] = { +#define _(sym,string) string, + foreach_ip6_hop_by_hop_error +#undef _ +}; + +static uword +ip6_hop_by_hop_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main; + u32 n_left_from, * from, * to_next; + ip_lookup_next_t next_index; + u32 processed = 0; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + +#if 0 /* $$$ DUAL-LOOP ME */ + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 next0 = IP6_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 next1 = IP6_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0, 
sw_if_index1; + u8 tmp0[6], tmp1[6]; + ethernet_header_t *en0, *en1; + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* $$$$$ Dual loop: process 2 x packets here $$$$$ */ + ASSERT (b0->current_data == 0); + ASSERT (b1->current_data == 0); + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b0); + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + + /* $$$$$ End of processing 2 x packets $$$$$ */ + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + } + } + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } +#endif + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + u32 adj_index0; + ip6_header_t * ip0; + ip_adjacency_t * adj0; + ip6_hop_by_hop_header_t *hbh0; + ip6_hop_by_hop_option_t *opt0, *limit0; + ioam_trace_option_t * 
trace0; + ioam_data_list_element_t * elt0; + u8 type0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ip0 = vlib_buffer_get_current (b0); + adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + adj0 = ip_get_adjacency (lm, adj_index0); + hbh0 = (ip6_hop_by_hop_header_t *)(ip0+1); + opt0 = (ip6_hop_by_hop_option_t *)(hbh0+1); + limit0 = (ip6_hop_by_hop_option_t *) + ((u8 *)hbh0 + ((hbh0->length+1)<<3)); + + /* Scan the set of h-b-h options, process ones that we understand */ + while (opt0 < limit0) + { + type0 = opt0->type & HBH_OPTION_TYPE_MASK; + switch (type0) + { + case HBH_OPTION_TYPE_IOAM_DATA_LIST: + trace0 = (ioam_trace_option_t *)opt0; + if (PREDICT_TRUE (trace0->data_list_elts_left)) + { + trace0->data_list_elts_left--; + elt0 = &trace0->elts[trace0->data_list_elts_left]; + elt0->ttl_node_id = + clib_host_to_net_u32 ((ip0->hop_limit<<24) + | hm->node_id); + elt0->ingress_if = + vnet_buffer(b0)->sw_if_index[VLIB_RX]; + elt0->egress_if = adj0->rewrite_header.sw_if_index; + elt0->timestamp = 123; /* $$$$ */ + /* $$$ set elt0->app_data */ + } + + opt0 = (ip6_hop_by_hop_option_t *) + (((u8 *)opt0) + opt0->length + + sizeof (ip6_hop_by_hop_option_t)); + break; + + case HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK: + opt0 = (ip6_hop_by_hop_option_t *) + (((u8 *)opt0) + sizeof (ioam_pow_option_t)); + break; + + case 0: /* Pad */ + opt0 = (ip6_hop_by_hop_option_t *) ((u8 *)opt0) + 1; + goto out0; + } + } + + out0: + + /* + * Since we push pkts here from the h-b-h header imposition code + * we have to be careful what we wish for... + */ + next0 = adj0->lookup_next_index != IP_LOOKUP_NEXT_ADD_HOP_BY_HOP ? 
+ adj0->lookup_next_index : adj0->saved_lookup_next_index; + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ip6_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + u32 trace_len = (hbh0->length+1)<<3; + t->next_index = next0; + /* Capture the h-b-h option verbatim */ + trace_len = trace_len < ARRAY_LEN(t->option_data) ? + trace_len : ARRAY_LEN(t->option_data); + t->trace_len = trace_len; + memcpy (t->option_data, hbh0, trace_len); + } + + processed++; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, ip6_hop_by_hop_node.index, + IP6_HOP_BY_HOP_ERROR_PROCESSED, processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_hop_by_hop_node) = { + .function = ip6_hop_by_hop_node_fn, + .name = "ip6-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip6_hop_by_hop_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(ip6_hop_by_hop_error_strings), + .error_strings = ip6_hop_by_hop_error_strings, + + /* See ip/lookup.h */ + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = "ip6-miss", + [IP_LOOKUP_NEXT_DROP] = "ip6-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip6-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip6-local", + [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor", + [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite", + [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify", + [IP_LOOKUP_NEXT_MAP] = "ip6-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd", + /* Next 3 arcs probably never used */ + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop", + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop", + }, +}; + +/* The main h-b-h tracer will be invoked, 
no need to do much here */ +typedef struct { + u32 next_index; +} ip6_add_hop_by_hop_trace_t; + +/* packet trace format function */ +static u8 * format_ip6_add_hop_by_hop_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip6_add_hop_by_hop_trace_t * t = va_arg (*args, + ip6_add_hop_by_hop_trace_t *); + + s = format (s, "IP6_ADD_HOP_BY_HOP: next index %d", + t->next_index); + return s; +} + +vlib_node_registration_t ip6_add_hop_by_hop_node; + +#define foreach_ip6_add_hop_by_hop_error \ +_(PROCESSED, "Pkts w/ added ip6 hop-by-hop options") + +typedef enum { +#define _(sym,str) IP6_ADD_HOP_BY_HOP_ERROR_##sym, + foreach_ip6_add_hop_by_hop_error +#undef _ + IP6_ADD_HOP_BY_HOP_N_ERROR, +} ip6_add_hop_by_hop_error_t; + +static char * ip6_add_hop_by_hop_error_strings[] = { +#define _(sym,string) string, + foreach_ip6_add_hop_by_hop_error +#undef _ +}; + +static uword +ip6_add_hop_by_hop_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main; + u32 n_left_from, * from, * to_next; + ip_lookup_next_t next_index; + u32 processed = 0; + u8 * rewrite = hm->rewrite; + u32 rewrite_length = vec_len (rewrite); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + +#if 0 + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 next0 = IP6_ADD_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 next1 = IP6_ADD_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0, sw_if_index1; + u8 tmp0[6], tmp1[6]; + ethernet_header_t *en0, *en1; + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + + /* Prefetch next iteration. 
*/ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* $$$$$ Dual loop: process 2 x packets here $$$$$ */ + ASSERT (b0->current_data == 0); + ASSERT (b1->current_data == 0); + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b0); + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + + /* $$$$$ End of processing 2 x packets $$$$$ */ + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_add_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = next0; + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_add_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + } + } + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } +#endif + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + ip6_header_t * ip0; + ip6_hop_by_hop_header_t * hbh0; + u64 * copy_src0, * copy_dst0; + u16 new_l0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, 
bi0); + + ip0 = vlib_buffer_get_current (b0); + + /* Copy the ip header left by the required amount */ + copy_dst0 = (u64 *)(((u8 *)ip0) - rewrite_length); + copy_src0 = (u64 *) ip0; + + copy_dst0 [0] = copy_src0 [0]; + copy_dst0 [1] = copy_src0 [1]; + copy_dst0 [2] = copy_src0 [2]; + copy_dst0 [3] = copy_src0 [3]; + copy_dst0 [4] = copy_src0 [4]; + vlib_buffer_advance (b0, - (word)rewrite_length); + ip0 = vlib_buffer_get_current (b0); + + hbh0 = (ip6_hop_by_hop_header_t *)(ip0 + 1); + /* $$$ tune, rewrite_length is a multiple of 8 */ + memcpy (hbh0, rewrite, rewrite_length); + /* Patch the protocol chain, insert the h-b-h (type 0) header */ + hbh0->protocol = ip0->protocol; + ip0->protocol = 0; + new_l0 = clib_net_to_host_u16 (ip0->payload_length) + rewrite_length; + ip0->payload_length = clib_host_to_net_u16 (new_l0); + + /* Populate the (first) h-b-h list elt */ + next0 = IP_LOOKUP_NEXT_HOP_BY_HOP; + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ip6_add_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + } + + processed++; + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, ip6_add_hop_by_hop_node.index, + IP6_ADD_HOP_BY_HOP_ERROR_PROCESSED, processed); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_add_hop_by_hop_node) = { + .function = ip6_add_hop_by_hop_node_fn, + .name = "ip6-add-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip6_add_hop_by_hop_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(ip6_add_hop_by_hop_error_strings), + .error_strings = ip6_add_hop_by_hop_error_strings, + + /* See ip/lookup.h */ + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = 
"ip6-miss", + [IP_LOOKUP_NEXT_DROP] = "ip6-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip6-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip6-local", + [IP_LOOKUP_NEXT_ARP] = "ip6-discover-neighbor", + [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite", + [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify", + [IP_LOOKUP_NEXT_MAP] = "ip6-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd", + /* Next 3 arcs probably never used */ + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop", + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop", + }, +}; + + +/* The main h-b-h tracer was already invoked, no need to do much here */ +typedef struct { + u32 next_index; +} ip6_pop_hop_by_hop_trace_t; + +/* packet trace format function */ +static u8 * format_ip6_pop_hop_by_hop_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip6_pop_hop_by_hop_trace_t * t = va_arg (*args, ip6_pop_hop_by_hop_trace_t *); + + s = format (s, "IP6_POP_HOP_BY_HOP: next index %d", + t->next_index); + return s; +} + +vlib_node_registration_t ip6_pop_hop_by_hop_node; + +#define foreach_ip6_pop_hop_by_hop_error \ +_(PROCESSED, "Pkts w/ removed ip6 hop-by-hop options") \ +_(NO_HOHO, "Pkts w/ no ip6 hop-by-hop options") + +typedef enum { +#define _(sym,str) IP6_POP_HOP_BY_HOP_ERROR_##sym, + foreach_ip6_pop_hop_by_hop_error +#undef _ + IP6_POP_HOP_BY_HOP_N_ERROR, +} ip6_pop_hop_by_hop_error_t; + +static char * ip6_pop_hop_by_hop_error_strings[] = { +#define _(sym,string) string, + foreach_ip6_pop_hop_by_hop_error +#undef _ +}; + +static uword +ip6_pop_hop_by_hop_node_fn (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main; + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + u32 n_left_from, * from, * to_next; + ip_lookup_next_t next_index; + u32 
processed = 0; + u32 no_header = 0; + u32 (*ioam_end_of_path_cb) (vlib_main_t *, vlib_node_runtime_t *, + vlib_buffer_t *, ip6_header_t *, + ip_adjacency_t *); + + ioam_end_of_path_cb = hm->ioam_end_of_path_cb; + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + +#if 0 + while (n_left_from >= 4 && n_left_to_next >= 2) + { + u32 next0 = IP6_POP_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 next1 = IP6_POP_HOP_BY_HOP_NEXT_INTERFACE_OUTPUT; + u32 sw_if_index0, sw_if_index1; + u8 tmp0[6], tmp1[6]; + ethernet_header_t *en0, *en1; + u32 bi0, bi1; + vlib_buffer_t * b0, * b1; + + /* Prefetch next iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + CLIB_PREFETCH (p3->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + /* speculatively enqueue b0 and b1 to the current next frame */ + to_next[0] = bi0 = from[0]; + to_next[1] = bi1 = from[1]; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + b0 = vlib_get_buffer (vm, bi0); + b1 = vlib_get_buffer (vm, bi1); + + /* $$$$$ Dual loop: process 2 x packets here $$$$$ */ + ASSERT (b0->current_data == 0); + ASSERT (b1->current_data == 0); + + ip0 = vlib_buffer_get_current (b0); + ip1 = vlib_buffer_get_current (b0); + + sw_if_index0 = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer(b1)->sw_if_index[VLIB_RX]; + + /* $$$$$ End of processing 2 x packets $$$$$ */ + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE))) + { + if (b0->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_pop_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = sw_if_index0; + t->next_index = 
next0; + } + if (b1->flags & VLIB_BUFFER_IS_TRACED) + { + ip6_pop_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b1, sizeof (*t)); + t->sw_if_index = sw_if_index1; + t->next_index = next1; + } + } + + /* verify speculative enqueues, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + bi0, bi1, next0, next1); + } +#endif + + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0; + u32 adj_index0; + ip6_header_t * ip0; + ip_adjacency_t * adj0; + ip6_hop_by_hop_header_t *hbh0; + u64 * copy_dst0, * copy_src0; + u16 new_l0; + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + + ip0 = vlib_buffer_get_current (b0); + adj_index0 = vnet_buffer (b0)->ip.adj_index[VLIB_TX]; + adj0 = ip_get_adjacency (lm, adj_index0); + + /* Perfectly normal to end up here w/ out h-b-h header */ + if (PREDICT_TRUE (ip0->protocol == 0)) + { + hbh0 = (ip6_hop_by_hop_header_t *)(ip0+1); + + /* Collect data from trace via callback */ + next0 = ioam_end_of_path_cb ? 
+ ioam_end_of_path_cb (vm, node, b0, ip0, adj0) + : adj0->saved_lookup_next_index; + + + /* Pop the trace data */ + vlib_buffer_advance (b0, (hbh0->length+1)<<3); + new_l0 = clib_net_to_host_u16 (ip0->payload_length) - + ((hbh0->length+1)<<3); + ip0->payload_length = clib_host_to_net_u16 (new_l0); + ip0->protocol = hbh0->protocol; + copy_src0 = (u64 *)ip0; + copy_dst0 = copy_src0 + (hbh0->length+1); + copy_dst0 [4] = copy_src0[4]; + copy_dst0 [3] = copy_src0[3]; + copy_dst0 [2] = copy_src0[2]; + copy_dst0 [1] = copy_src0[1]; + copy_dst0 [0] = copy_src0[0]; + processed++; + } + else + { + next0 = adj0->saved_lookup_next_index; + no_header++; + } + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ip6_pop_hop_by_hop_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->next_index = next0; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, ip6_pop_hop_by_hop_node.index, + IP6_POP_HOP_BY_HOP_ERROR_PROCESSED, processed); + vlib_node_increment_counter (vm, ip6_pop_hop_by_hop_node.index, + IP6_POP_HOP_BY_HOP_ERROR_NO_HOHO, no_header); + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_pop_hop_by_hop_node) = { + .function = ip6_pop_hop_by_hop_node_fn, + .name = "ip6-pop-hop-by-hop", + .vector_size = sizeof (u32), + .format_trace = format_ip6_pop_hop_by_hop_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = ARRAY_LEN(ip6_pop_hop_by_hop_error_strings), + .error_strings = ip6_pop_hop_by_hop_error_strings, + + /* See ip/lookup.h */ + .n_next_nodes = IP_LOOKUP_N_NEXT, + .next_nodes = { + [IP_LOOKUP_NEXT_MISS] = "ip6-miss", + [IP_LOOKUP_NEXT_DROP] = "ip6-drop", + [IP_LOOKUP_NEXT_PUNT] = "ip6-punt", + [IP_LOOKUP_NEXT_LOCAL] = "ip6-local", + [IP_LOOKUP_NEXT_ARP] = 
"ip6-discover-neighbor", + [IP_LOOKUP_NEXT_REWRITE] = "ip6-rewrite", + [IP_LOOKUP_NEXT_CLASSIFY] = "ip6-classify", + [IP_LOOKUP_NEXT_MAP] = "ip6-map", + [IP_LOOKUP_NEXT_MAP_T] = "ip6-map-t", + [IP_LOOKUP_NEXT_SIXRD] = "ip6-sixrd", + /* Next 3 arcs probably never used */ + [IP_LOOKUP_NEXT_HOP_BY_HOP] = "ip6-hop-by-hop", + [IP_LOOKUP_NEXT_ADD_HOP_BY_HOP] = "ip6-add-hop-by-hop", + [IP_LOOKUP_NEXT_POP_HOP_BY_HOP] = "ip6-pop-hop-by-hop", + }, +}; + + +static clib_error_t * +ip6_hop_by_hop_init (vlib_main_t * vm) +{ + ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main; + + hm->vlib_main = vm; + hm->vnet_main = vnet_get_main(); + + return 0; +} + +VLIB_INIT_FUNCTION (ip6_hop_by_hop_init); + +int ip6_ioam_set_rewrite (u8 **rwp, u32 trace_option_elts, int has_pow_option) +{ + u8 *rewrite = 0; + u32 size, rnd_size; + ip6_hop_by_hop_header_t *hbh; + ioam_trace_option_t * trace_option; + ioam_pow_option_t * pow_option; + u8 *current; + + vec_free (*rwp); + + if (trace_option_elts == 0 && has_pow_option == 0) + return 0; + + if (trace_option_elts * sizeof (ioam_data_list_element_t) > 254) + return VNET_API_ERROR_INVALID_VALUE; + + /* Work out how much space we need */ + size = sizeof (ip6_hop_by_hop_header_t); + + if (trace_option_elts) + { + size += sizeof (ip6_hop_by_hop_option_t); + size += trace_option_elts * (sizeof (ioam_data_list_element_t)); + } + if (has_pow_option) + { + size += sizeof (ip6_hop_by_hop_option_t); + size += sizeof (ioam_pow_option_t); + } + + /* Round to a multiple of 8 octets */ + rnd_size = (size + 7) & ~7; + + /* allocate it, zero-fill / pad by construction */ + vec_validate (rewrite, rnd_size-1); + + hbh = (ip6_hop_by_hop_header_t *) rewrite; + /* Length of header in 8 octet units, not incl first 8 octets */ + hbh->length = (rnd_size>>3) - 1; + current = (u8 *)(hbh+1); + + if (trace_option_elts) + { + trace_option = (ioam_trace_option_t *)current; + trace_option->hdr.type = HBH_OPTION_TYPE_IOAM_DATA_LIST + | HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE; + 
trace_option->hdr.length = 1 /*data_list_elts_left */ + + trace_option_elts * sizeof (ioam_data_list_element_t); + trace_option->data_list_elts_left = trace_option_elts; + current += sizeof (ioam_trace_option_t) + + trace_option_elts * sizeof (ioam_data_list_element_t); + } + if (has_pow_option) + { + pow_option = (ioam_pow_option_t *)current; + pow_option->hdr.type = HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK + | HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE; + pow_option->hdr.length = sizeof (ioam_pow_option_t) - + sizeof (ip6_hop_by_hop_option_t); + current += sizeof (ioam_pow_option_t); + } + + *rwp = rewrite; + return 0; +} + +static clib_error_t * +ip6_ioam_set_rewrite_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ip6_hop_by_hop_main_t *hm = &ip6_hop_by_hop_main; + u32 trace_option_elts = 0; + int has_pow_option = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "trace-elts %d", &trace_option_elts)) + ; + else if (unformat (input, "pow")) + has_pow_option = 1; + else + break; + } + + rv = ip6_ioam_set_rewrite (&hm->rewrite, trace_option_elts, has_pow_option); + + switch (rv) + { + case 0: + break; + default: + return clib_error_return (0, "ip6_ioam_set_rewrite returned %d", rv); + } + + return 0; +} + +VLIB_CLI_COMMAND (ip6_ioam_set_rewrite_cmd, static) = { + .path = "ioam set rewrite", + .short_help = "ioam set rewrite [trace-elts <nn>] [pow]", + .function = ip6_ioam_set_rewrite_command_fn, +}; + +int ip6_ioam_set_destination (ip6_address_t *addr, u32 mask_width, u32 vrf_id, + int is_add, int is_pop, int is_none) +{ + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + ip_adjacency_t * adj; + u32 fib_index; + u32 len, adj_index; + int i, rv; + uword * p; + BVT(clib_bihash_kv) kv, value; + + if ((is_add + is_pop + is_none) != 1) + return VNET_API_ERROR_INVALID_VALUE_2; + + /* Go find the adjacency we're supposed to tickle */ + p = hash_get 
(im->fib_index_by_table_id, vrf_id); + + if (p == 0) + return VNET_API_ERROR_NO_SUCH_FIB; + + fib_index = p[0]; + + len = vec_len (im->prefix_lengths_in_search_order); + + for (i = 0; i < len; i++) + { + int dst_address_length = im->prefix_lengths_in_search_order[i]; + ip6_address_t * mask = &im->fib_masks[dst_address_length]; + + if (dst_address_length != mask_width) + continue; + + kv.key[0] = addr->as_u64[0] & mask->as_u64[0]; + kv.key[1] = addr->as_u64[1] & mask->as_u64[1]; + kv.key[2] = ((u64)((fib_index))<<32) | dst_address_length; + + rv = BV(clib_bihash_search_inline_2)(&im->ip6_lookup_table, &kv, &value); + if (rv == 0) + goto found; + + } + return VNET_API_ERROR_NO_SUCH_ENTRY; + + found: + + /* Got it, modify as directed... */ + adj_index = value.value; + adj = ip_get_adjacency (lm, adj_index); + + /* Restore original lookup-next action */ + if (adj->saved_lookup_next_index) + { + adj->lookup_next_index = adj->saved_lookup_next_index; + adj->saved_lookup_next_index = 0; + } + + /* Save current action */ + if (is_add || is_pop) + adj->saved_lookup_next_index = adj->lookup_next_index; + + if (is_add) + adj->lookup_next_index = IP_LOOKUP_NEXT_ADD_HOP_BY_HOP; + + if (is_pop) + adj->lookup_next_index = IP_LOOKUP_NEXT_POP_HOP_BY_HOP; + + return 0; +} + +static clib_error_t * +ip6_ioam_set_destination_command_fn (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + ip6_address_t addr; + u32 mask_width = ~0; + int is_add = 0; + int is_pop = 0; + int is_none = 0; + u32 vrf_id = 0; + int rv; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "%U/%d", + unformat_ip6_address, &addr, &mask_width)) + ; + else if (unformat (input, "vrf-id %d", &vrf_id)) + ; + else if (unformat (input, "add")) + is_add = 1; + else if (unformat (input, "pop")) + is_pop = 1; + else if (unformat (input, "none")) + is_none = 1; + else + break; + } + + if ((is_add + is_pop + is_none) != 1) + return clib_error_return (0, "One 
of (add, pop, none) required"); + if (mask_width == ~0) + return clib_error_return (0, "<address>/<mask-width> required"); + + rv = ip6_ioam_set_destination (&addr, mask_width, vrf_id, + is_add, is_pop, is_none); + + switch (rv) + { + case 0: + break; + default: + return clib_error_return (0, "ip6_ioam_set_destination returned %d", rv); + } + + return 0; +} + +VLIB_CLI_COMMAND (ip6_ioam_set_destination_cmd, static) = { + .path = "ioam set destination", + .short_help = "ioam set destination <ip6-address>/<width> add | pop | none", + .function = ip6_ioam_set_destination_command_fn, +}; + +void vnet_register_ioam_end_of_path_callback (void *cb) +{ + ip6_hop_by_hop_main_t * hm = &ip6_hop_by_hop_main; + + hm->ioam_end_of_path_cb = cb; +} + diff --git a/vnet/vnet/ip/ip6_hop_by_hop.h b/vnet/vnet/ip/ip6_hop_by_hop.h new file mode 100644 index 00000000000..82bafc5777b --- /dev/null +++ b/vnet/vnet/ip/ip6_hop_by_hop.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_ip6_hop_by_hop_h__ +#define __included_ip6_hop_by_hop_h__ + +#include <vnet/ip/ip6_hop_by_hop_packet.h> + +typedef struct { + /* The current rewrite we're using */ + u8 * rewrite; + + /* Trace data processing callback */ + void *ioam_end_of_path_cb; + + /* Configured node-id */ + u32 node_id; + + /* convenience */ + vlib_main_t * vlib_main; + vnet_main_t * vnet_main; +} ip6_hop_by_hop_main_t; + +#endif /* __included_ip6_hop_by_hop_h__ */ diff --git a/vnet/vnet/ip/ip6_hop_by_hop_packet.h b/vnet/vnet/ip/ip6_hop_by_hop_packet.h new file mode 100644 index 00000000000..a3d19035dae --- /dev/null +++ b/vnet/vnet/ip/ip6_hop_by_hop_packet.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __included_ip6_hop_by_hop_packet_h__ +#define __included_ip6_hop_by_hop_packet_h__ + +typedef struct { + /* Protocol for next header */ + u8 protocol; + /* + * Length of hop_by_hop header in 8 octet units, + * not including the first 8 octets + */ + u8 length; +} ip6_hop_by_hop_header_t; + +typedef struct { + /* Option Type */ +#define HBH_OPTION_TYPE_SKIP_UNKNOWN (0x0 << 6) +#define HBH_OPTION_TYPE_DISCARD_UNKNOWN (0x1 << 6) +#define HBH_OPTION_TYPE_DISCARD_UNKNOWN_ICMP (0x2 << 6) +#define HBH_OPTION_TYPE_DISCARD_UNKNOWN_ICMP_NOT_MCAST (0x3 << 6) +#define HBH_OPTION_TYPE_DATA_CHANGE_ENROUTE (1<<5) +#define HBH_OPTION_TYPE_MASK (0x1F) + u8 type; + /* Length in octets of the option data field */ + u8 length; +} ip6_hop_by_hop_option_t; + +/* $$$$ IANA banana constants */ +#define HBH_OPTION_TYPE_IOAM_DATA_LIST 1 +#define HBH_OPTION_TYPE_IOAM_PROOF_OF_WORK 2 + +typedef struct { + u32 ttl_node_id; + u16 ingress_if; + u16 egress_if; + u32 timestamp; + u32 app_data; +} ioam_data_list_element_t; + +typedef CLIB_PACKED(struct { + ip6_hop_by_hop_option_t hdr; + u8 data_list_elts_left; + ioam_data_list_element_t elts[0]; +}) ioam_trace_option_t; + +typedef CLIB_PACKED(struct { + ip6_hop_by_hop_option_t hdr; + u8 pow_type; + u8 reserved; + u32 random[2]; + u32 cumulative[2]; +}) ioam_pow_option_t; + +#endif /* __included_ip6_hop_by_hop_packet_h__ */ diff --git a/vnet/vnet/ip/ip6_input.c b/vnet/vnet/ip/ip6_input.c new file mode 100644 index 00000000000..ef8c7762625 --- /dev/null +++ b/vnet/vnet/ip/ip6_input.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip6_input.c: IP v6 input node + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vnet/ppp/ppp.h> +#include <vnet/hdlc/hdlc.h> + +typedef struct { + u8 packet_data[64]; +} ip6_input_trace_t; + +static u8 * format_ip6_input_trace (u8 * s, va_list * va) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*va, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*va, vlib_node_t *); + ip6_input_trace_t * t = va_arg (*va, ip6_input_trace_t *); + + s = format (s, "%U", + format_ip6_header, + t->packet_data, sizeof (t->packet_data)); + + return s; +} + +typedef enum { + IP6_INPUT_NEXT_DROP, + IP6_INPUT_NEXT_LOOKUP, + IP6_INPUT_NEXT_TTL_EXPIRE, + IP6_INPUT_N_NEXT, +} ip6_input_next_t; + +/* Validate IP v6 packets and pass them either to forwarding code + or drop exception packets. */ +static uword +ip6_input (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + u32 n_left_from, * from, * to_next; + ip6_input_next_t next_index; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_input_node.index); + vlib_simple_counter_main_t * cm; + u32 cpu_index = os_get_cpu_number(); + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (ip6_input_trace_t)); + + cm = vec_elt_at_index (vnm->interface_main.sw_if_counters, + VNET_INTERFACE_COUNTER_IP6); + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + while (n_left_from >= 4 && n_left_to_next >= 2) + { + vlib_buffer_t * p0, * p1; + ip6_header_t * ip0, * ip1; + ip_config_main_t * cm0, * cm1; + u32 pi0, sw_if_index0, next0; + u32 pi1, sw_if_index1, next1; + u8 error0, error1, cast0, cast1; + + /* Prefetch next 
iteration. */ + { + vlib_buffer_t * p2, * p3; + + p2 = vlib_get_buffer (vm, from[2]); + p3 = vlib_get_buffer (vm, from[3]); + + vlib_prefetch_buffer_header (p2, LOAD); + vlib_prefetch_buffer_header (p3, LOAD); + + CLIB_PREFETCH (p2->data, sizeof (ip0[0]), LOAD); + CLIB_PREFETCH (p3->data, sizeof (ip1[0]), LOAD); + } + + pi0 = from[0]; + pi1 = from[1]; + + to_next[0] = pi0; + to_next[1] = pi1; + from += 2; + to_next += 2; + n_left_from -= 2; + n_left_to_next -= 2; + + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + + ip0 = vlib_buffer_get_current (p0); + ip1 = vlib_buffer_get_current (p1); + + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + sw_if_index1 = vnet_buffer (p1)->sw_if_index[VLIB_RX]; + + cast0 = ip6_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST; + cast1 = ip6_address_is_multicast (&ip1->dst_address) ? VNET_MULTICAST : VNET_UNICAST; + + cm0 = lm->rx_config_mains + cast0; + cm1 = lm->rx_config_mains + cast1; + + vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0); + vnet_buffer (p1)->ip.current_config_index = vec_elt (cm1->config_index_by_sw_if_index, sw_if_index1); + + vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; + vnet_buffer (p1)->ip.adj_index[VLIB_RX] = ~0; + + vnet_get_config_data (&cm0->config_main, + &vnet_buffer (p0)->ip.current_config_index, + &next0, + /* # bytes of config data */ 0); + vnet_get_config_data (&cm1->config_main, + &vnet_buffer (p1)->ip.current_config_index, + &next1, + /* # bytes of config data */ 0); + + vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + vlib_increment_simple_counter (cm, cpu_index, sw_if_index1, 1); + + error0 = error1 = IP6_ERROR_NONE; + + /* Version != 6? Drop it. */ + error0 = (clib_net_to_host_u32 (ip0->ip_version_traffic_class_and_flow_label) >> 28) != 6 ? IP6_ERROR_VERSION : error0; + error1 = (clib_net_to_host_u32 (ip1->ip_version_traffic_class_and_flow_label) >> 28) != 6 ? 
IP6_ERROR_VERSION : error1; + + /* hop limit < 1? Drop it. for link-local broadcast packets, + * like dhcpv6 packets from client has hop-limit 1, which should not + * be dropped. + */ + error0 = ip0->hop_limit <= 1 ? IP6_ERROR_TIME_EXPIRED : error0; + error1 = ip1->hop_limit <= 1 ? IP6_ERROR_TIME_EXPIRED : error1; + + /* L2 length must be at least minimal IP header. */ + error0 = p0->current_length < sizeof (ip0[0]) ? IP6_ERROR_TOO_SHORT : error0; + error1 = p1->current_length < sizeof (ip1[0]) ? IP6_ERROR_TOO_SHORT : error1; + + if (PREDICT_FALSE(error0 != IP6_ERROR_NONE)) + { + next0 = (error0 == IP6_ERROR_TIME_EXPIRED) ? + IP6_INPUT_NEXT_TTL_EXPIRE : IP6_INPUT_NEXT_DROP; + } + if (PREDICT_FALSE(error1 != IP6_ERROR_NONE)) + { + next1 = (error1 == IP6_ERROR_TIME_EXPIRED) ? + IP6_INPUT_NEXT_TTL_EXPIRE : IP6_INPUT_NEXT_DROP; + } + + p0->error = error_node->errors[error0]; + p1->error = error_node->errors[error1]; + + vlib_validate_buffer_enqueue_x2 (vm, node, next_index, + to_next, n_left_to_next, + pi0, pi1, next0, next1); + } + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + ip_config_main_t * cm0; + u32 pi0, sw_if_index0, next0; + u8 error0, cast0; + + pi0 = from[0]; + to_next[0] = pi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, pi0); + ip0 = vlib_buffer_get_current (p0); + + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + cast0 = ip6_address_is_multicast (&ip0->dst_address) ? VNET_MULTICAST : VNET_UNICAST; + cm0 = lm->rx_config_mains + cast0; + vnet_buffer (p0)->ip.current_config_index = vec_elt (cm0->config_index_by_sw_if_index, sw_if_index0); + vnet_buffer (p0)->ip.adj_index[VLIB_RX] = ~0; + + vnet_get_config_data (&cm0->config_main, + &vnet_buffer (p0)->ip.current_config_index, + &next0, + /* # bytes of config data */ 0); + + vlib_increment_simple_counter (cm, cpu_index, sw_if_index0, 1); + error0 = IP6_ERROR_NONE; + + /* Version != 6? 
Drop it. */ + error0 = (clib_net_to_host_u32 (ip0->ip_version_traffic_class_and_flow_label) >> 28) != 6 ? IP6_ERROR_VERSION : error0; + + /* hop limit < 1? Drop it. for link-local broadcast packets, + * like dhcpv6 packets from client has hop-limit 1, which should not + * be dropped. + */ + error0 = ip0->hop_limit <= 1 ? IP6_ERROR_TIME_EXPIRED : error0; + + /* L2 length must be at least minimal IP header. */ + error0 = p0->current_length < sizeof (ip0[0]) ? IP6_ERROR_TOO_SHORT : error0; + + if (PREDICT_FALSE(error0 != IP6_ERROR_NONE)) + { + next0 = (error0 == IP6_ERROR_TIME_EXPIRED) ? + IP6_INPUT_NEXT_TTL_EXPIRE : IP6_INPUT_NEXT_DROP; + } + p0->error = error_node->errors[error0]; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + pi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + return frame->n_vectors; +} + +static char * ip6_error_strings[] = { +#define _(sym,string) string, + foreach_ip6_error +#undef _ +}; + +VLIB_REGISTER_NODE (ip6_input_node) = { + .function = ip6_input, + .name = "ip6-input", + .vector_size = sizeof (u32), + + .n_errors = IP6_N_ERROR, + .error_strings = ip6_error_strings, + + .n_next_nodes = IP6_INPUT_N_NEXT, + .next_nodes = { + [IP6_INPUT_NEXT_DROP] = "error-drop", + [IP6_INPUT_NEXT_LOOKUP] = "ip6-lookup", + [IP6_INPUT_NEXT_TTL_EXPIRE] = "ip6-icmp-ttl-expire", + }, + + .format_buffer = format_ip6_header, + .format_trace = format_ip6_input_trace, +}; + +static clib_error_t * ip6_init (vlib_main_t * vm) +{ + ethernet_register_input_type (vm, ETHERNET_TYPE_IP6, + ip6_input_node.index); + ppp_register_input_protocol (vm, PPP_PROTOCOL_ip6, + ip6_input_node.index); + hdlc_register_input_protocol (vm, HDLC_PROTOCOL_ip6, + ip6_input_node.index); + + { + pg_node_t * pn; + pn = pg_get_node (ip6_input_node.index); + pn->unformat_edit = unformat_pg_ip6_header; + } + + /* Set flow hash to something non-zero. 
*/ + ip6_main.flow_hash_seed = 0xdeadbeef; + + /* Default hop limit for packets we generate. */ + ip6_main.host_config.ttl = 64; + + return /* no error */ 0; +} + +VLIB_INIT_FUNCTION (ip6_init); diff --git a/vnet/vnet/ip/ip6_neighbor.c b/vnet/vnet/ip/ip6_neighbor.c new file mode 100644 index 00000000000..28f964c804f --- /dev/null +++ b/vnet/vnet/ip/ip6_neighbor.c @@ -0,0 +1,3146 @@ +/* + * ip/ip6_neighbor.c: IP6 neighbor handling + * + * Copyright (c) 2010 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <vnet/ip/ip.h> +#include <vnet/ethernet/ethernet.h> +#include <vppinfra/mhash.h> +#include <vppinfra/md5.h> + +#if DPDK==1 +#include <vnet/devices/dpdk/dpdk.h> +#endif + +typedef struct { + ip6_address_t ip6_address; + u32 sw_if_index; + u32 pad; +} ip6_neighbor_key_t; + +/* can't use sizeof link_layer_address, that's 8 */ +#define ETHER_MAC_ADDR_LEN 6 + +typedef struct { + ip6_neighbor_key_t key; + u8 link_layer_address[8]; + u64 cpu_time_last_updated; +} ip6_neighbor_t; + +/* advertised prefix option */ +typedef struct { + /* basic advertised information */ + ip6_address_t prefix; + u8 prefix_len; + int adv_on_link_flag; + int adv_autonomous_flag; + u32 adv_valid_lifetime_in_secs; + u32 adv_pref_lifetime_in_secs; + + /* advertised values are computed from these times if decrementing */ + f64 valid_lifetime_expires; + f64 pref_lifetime_expires; + + /* local information */ + int enabled; + int deprecated_prefix_flag; + int decrement_lifetime_flag; + +#define MIN_ADV_VALID_LIFETIME 7203 /* seconds */ +#define DEF_ADV_VALID_LIFETIME 2592000 +#define DEF_ADV_PREF_LIFETIME 604800 + + /* extensions are added here, mobile, DNS etc.. 
*/ +} ip6_radv_prefix_t; + + +typedef struct { + /* group information */ + u8 type; + ip6_address_t mcast_address; + u16 num_sources; + ip6_address_t *mcast_source_address_pool; +} ip6_mldp_group_t; + +/* configured router advertisement information per ipv6 interface */ +typedef struct { + + /* advertised config information, zero means unspecified */ + u8 curr_hop_limit; + int adv_managed_flag; + int adv_other_flag; + u16 adv_router_lifetime_in_sec; + u32 adv_neighbor_reachable_time_in_msec; + u32 adv_time_in_msec_between_retransmitted_neighbor_solicitations; + + /* mtu option */ + u32 adv_link_mtu; + + /* source link layer option */ + u8 link_layer_address[8]; + u8 link_layer_addr_len; + + /* prefix option */ + ip6_radv_prefix_t * adv_prefixes_pool; + + /* Hash table mapping address to index in interface advertised prefix pool. */ + mhash_t address_to_prefix_index; + + /* MLDP group information */ + ip6_mldp_group_t * mldp_group_pool; + + /* Hash table mapping address to index in mldp address pool. 
*/ + mhash_t address_to_mldp_index; + + /* local information */ + u32 sw_if_index; + u32 fib_index; + int send_radv; /* radv on/off on this interface - set by config */ + int cease_radv; /* we are ceasing to send - set by config */ + int send_unicast; + int adv_link_layer_address; + int prefix_option; + int failed_device_check; + int all_routers_mcast; + u32 seed; + u64 randomizer; + int ref_count; + u32 all_nodes_adj_index; + u32 all_routers_adj_index; + u32 all_mldv2_routers_adj_index; + + /* timing information */ +#define DEF_MAX_RADV_INTERVAL 200 +#define DEF_MIN_RADV_INTERVAL .75 * DEF_MAX_RADV_INTERVAL +#define DEF_CURR_HOP_LIMIT 64 +#define DEF_DEF_RTR_LIFETIME 3 * DEF_MAX_RADV_INTERVAL +#define MAX_DEF_RTR_LIFETIME 9000 + +#define MAX_INITIAL_RTR_ADVERT_INTERVAL 16 /* seconds */ +#define MAX_INITIAL_RTR_ADVERTISEMENTS 3 /* transmissions */ +#define MIN_DELAY_BETWEEN_RAS 3 /* seconds */ +#define MAX_DELAY_BETWEEN_RAS 1800 /* seconds */ +#define MAX_RA_DELAY_TIME .5 /* seconds */ + + f64 max_radv_interval; + f64 min_radv_interval; + f64 min_delay_between_radv; + f64 max_delay_between_radv; + f64 max_rtr_default_lifetime; + + f64 last_radv_time; + f64 last_multicast_time; + f64 next_multicast_time; + + + u32 initial_adverts_count; + f64 initial_adverts_interval; + u32 initial_adverts_sent; + + /* stats */ + u32 n_advertisements_sent; + u32 n_solicitations_rcvd; + u32 n_solicitations_dropped; + + /* Link local address to use (defaults to underlying physical for logical interfaces) */ + ip6_address_t link_local_address; + u8 link_local_prefix_len; + +} ip6_radv_t; + +typedef struct { + u32 next_index; + uword node_index; + uword type_opaque; + uword data; +} pending_resolution_t; + + +typedef struct { + /* Hash tables mapping name to opcode. 
*/ + uword * opcode_by_name; + + /* lite beer "glean" adjacency handling */ + mhash_t pending_resolutions_by_address; + pending_resolution_t * pending_resolutions; + + u32 * neighbor_input_next_index_by_hw_if_index; + + ip6_neighbor_t * neighbor_pool; + + mhash_t neighbor_index_by_key; + + u32 * if_radv_pool_index_by_sw_if_index; + + ip6_radv_t * if_radv_pool; + + /* Neighbor attack mitigation */ + u32 limit_neighbor_cache_size; + u32 neighbor_delete_rotor; + +} ip6_neighbor_main_t; + +static ip6_neighbor_main_t ip6_neighbor_main; + +static u8 * format_ip6_neighbor_ip6_entry (u8 * s, va_list * va) +{ + vlib_main_t * vm = va_arg (*va, vlib_main_t *); + ip6_neighbor_t * n = va_arg (*va, ip6_neighbor_t *); + vnet_main_t * vnm = vnet_get_main(); + vnet_sw_interface_t * si; + + if (! n) + return format (s, "%=12s%=20s%=20s%=40s", "Time", "Address", "Link layer", "Interface"); + + si = vnet_get_sw_interface (vnm, n->key.sw_if_index); + s = format (s, "%=12U%=20U%=20U%=40U", + format_vlib_cpu_time, vm, n->cpu_time_last_updated, + format_ip6_address, &n->key.ip6_address, + format_ethernet_address, n->link_layer_address, + format_vnet_sw_interface_name, vnm, si); + + return s; +} + +static clib_error_t * +ip6_neighbor_sw_interface_up_down (vnet_main_t * vnm, + u32 sw_if_index, + u32 flags) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip6_neighbor_t * n; + + if (! 
(flags & VNET_SW_INTERFACE_FLAG_ADMIN_UP)) + { + u32 i, * to_delete = 0; + + pool_foreach (n, nm->neighbor_pool, ({ + if (n->key.sw_if_index == sw_if_index) + vec_add1 (to_delete, n - nm->neighbor_pool); + })); + + for (i = 0; i < vec_len (to_delete); i++) + { + n = pool_elt_at_index (nm->neighbor_pool, to_delete[i]); + mhash_unset (&nm->neighbor_index_by_key, &n->key, 0); + pool_put (nm->neighbor_pool, n); + } + + vec_free (to_delete); + } + + return 0; +} + +VNET_SW_INTERFACE_ADMIN_UP_DOWN_FUNCTION (ip6_neighbor_sw_interface_up_down); + +static void unset_random_neighbor_entry (void) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + vnet_main_t * vnm = vnet_get_main(); + vlib_main_t * vm = vnm->vlib_main; + ip6_neighbor_t * e; + u32 index; + + index = pool_next_index (nm->neighbor_pool, nm->neighbor_delete_rotor); + nm->neighbor_delete_rotor = index; + + /* Try again from elt 0, could happen if an intfc goes down */ + if (index == ~0) + { + index = pool_next_index (nm->neighbor_pool, nm->neighbor_delete_rotor); + nm->neighbor_delete_rotor = index; + } + + /* Nothing left in the pool */ + if (index == ~0) + return; + + e = pool_elt_at_index (nm->neighbor_pool, index); + + vnet_unset_ip6_ethernet_neighbor (vm, e->key.sw_if_index, + &e->key.ip6_address, + e->link_layer_address, + ETHER_MAC_ADDR_LEN); +} + +typedef struct { + u8 is_add; + u8 pad; + u8 link_layer_address[6]; + u32 sw_if_index; + ip6_address_t addr; +} ip6_neighbor_set_unset_rpc_args_t; + +static void ip6_neighbor_set_unset_rpc_callback +( ip6_neighbor_set_unset_rpc_args_t * a); + +#if DPDK > 0 +static void set_unset_ip6_neighbor_rpc +(vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t * a, + u8 *link_layer_addreess, + int is_add) +{ + ip6_neighbor_set_unset_rpc_args_t args; + + args.sw_if_index = sw_if_index; + args.is_add = is_add; + memcpy (&args.addr, a, sizeof (*a)); + memcpy (args.link_layer_address, link_layer_addreess, 6); + + vl_api_rpc_call_main_thread 
(ip6_neighbor_set_unset_rpc_callback, + (u8 *) &args, sizeof (args)); +} +#endif + +int +vnet_set_ip6_ethernet_neighbor (vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t * a, + u8 * link_layer_address, + uword n_bytes_link_layer_address) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip6_neighbor_key_t k; + ip6_neighbor_t * n; + ip6_main_t * im = &ip6_main; + uword * p; + u32 next_index; + pending_resolution_t * pr; + +#if DPDK > 0 + if (os_get_cpu_number()) + { + set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, + 1 /* set new neighbor */); + return 0; + } +#endif + + k.sw_if_index = sw_if_index; + k.ip6_address = a[0]; + k.pad = 0; + + vlib_worker_thread_barrier_sync (vm); + + p = mhash_get (&nm->neighbor_index_by_key, &k); + if (p) + n = pool_elt_at_index (nm->neighbor_pool, p[0]); + else + { + ip6_add_del_route_args_t args; + ip_adjacency_t adj; + + memset (&adj, 0, sizeof(adj)); + adj.lookup_next_index = IP_LOOKUP_NEXT_REWRITE; + adj.explicit_fib_index = ~0; + + vnet_rewrite_for_sw_interface + (vnm, + VNET_L3_PACKET_TYPE_IP6, + sw_if_index, + ip6_rewrite_node.index, + link_layer_address, + &adj.rewrite_header, + sizeof (adj.rewrite_data)); + + args.table_index_or_table_id = im->fib_index_by_sw_if_index[sw_if_index]; + args.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_ADD | IP6_ROUTE_FLAG_NEIGHBOR; + args.dst_address = a[0]; + args.dst_address_length = 128; + args.adj_index = ~0; + args.add_adj = &adj; + args.n_add_adj = 1; + + ip6_add_del_route (im, &args); + pool_get (nm->neighbor_pool, n); + mhash_set (&nm->neighbor_index_by_key, &k, n - nm->neighbor_pool, + /* old value */ 0); + n->key = k; + } + + /* Update time stamp and ethernet address. */ + memcpy (n->link_layer_address, link_layer_address, n_bytes_link_layer_address); + n->cpu_time_last_updated = clib_cpu_time_now (); + + /* Customer(s) waiting for this address to be resolved? 
*/ + p = mhash_get (&nm->pending_resolutions_by_address, a); + if (p == 0) + goto out; + + next_index = p[0]; + + while (next_index != (u32)~0) + { + pr = pool_elt_at_index (nm->pending_resolutions, next_index); + vlib_process_signal_event (vm, pr->node_index, + pr->type_opaque, + pr->data); + next_index = pr->next_index; + pool_put (nm->pending_resolutions, pr); + } + + mhash_unset (&nm->pending_resolutions_by_address, a, 0); + +out: + vlib_worker_thread_barrier_release(vm); + return 0; +} + +int +vnet_unset_ip6_ethernet_neighbor (vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t * a, + u8 * link_layer_address, + uword n_bytes_link_layer_address) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip6_neighbor_key_t k; + ip6_neighbor_t * n; + ip6_main_t * im = &ip6_main; + ip6_add_del_route_args_t args; + uword * p; + int rv = 0; + +#if DPDK > 0 + if (os_get_cpu_number()) + { + set_unset_ip6_neighbor_rpc (vm, sw_if_index, a, link_layer_address, + 0 /* unset */); + return 0; + } +#endif + + k.sw_if_index = sw_if_index; + k.ip6_address = a[0]; + k.pad = 0; + + vlib_worker_thread_barrier_sync (vm); + + p = mhash_get (&nm->neighbor_index_by_key, &k); + if (p == 0) + { + rv = -1; + goto out; + } + + n = pool_elt_at_index (nm->neighbor_pool, p[0]); + mhash_unset (&nm->neighbor_index_by_key, &n->key, 0); + pool_put (nm->neighbor_pool, n); + + args.table_index_or_table_id = im->fib_index_by_sw_if_index[sw_if_index]; + args.flags = IP6_ROUTE_FLAG_FIB_INDEX | IP6_ROUTE_FLAG_DEL + | IP6_ROUTE_FLAG_NEIGHBOR; + args.dst_address = a[0]; + args.dst_address_length = 128; + args.adj_index = ~0; + args.add_adj = NULL; + args.n_add_adj = 0; + ip6_add_del_route (im, &args); + out: + vlib_worker_thread_barrier_release(vm); + return rv; +} + +static void ip6_neighbor_set_unset_rpc_callback +( ip6_neighbor_set_unset_rpc_args_t * a) +{ + vlib_main_t * vm = vlib_get_main(); + if (a->is_add) + vnet_set_ip6_ethernet_neighbor (vm, a->sw_if_index, &a->addr, + a->link_layer_address, 6); + 
else + vnet_unset_ip6_ethernet_neighbor (vm, a->sw_if_index, &a->addr, + a->link_layer_address, 6); +} + +static int +ip6_neighbor_sort (void *a1, void *a2) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_t * n1 = a1, * n2 = a2; + int cmp; + cmp = vnet_sw_interface_compare (vnm, n1->key.sw_if_index, + n2->key.sw_if_index); + if (! cmp) + cmp = ip6_address_compare (&n1->key.ip6_address, &n2->key.ip6_address); + return cmp; +} + +static clib_error_t * +show_ip6_neighbors (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip6_neighbor_t * n, * ns; + clib_error_t * error = 0; + u32 sw_if_index; + + /* Filter entries by interface if given. */ + sw_if_index = ~0; + (void) unformat_user (input, unformat_vnet_sw_interface, vnm, &sw_if_index); + + ns = 0; + pool_foreach (n, nm->neighbor_pool, ({ vec_add1 (ns, n[0]); })); + vec_sort_with_function (ns, ip6_neighbor_sort); + vlib_cli_output (vm, "%U", format_ip6_neighbor_ip6_entry, vm, 0); + vec_foreach (n, ns) { + if (sw_if_index != ~0 && n->key.sw_if_index != sw_if_index) + continue; + vlib_cli_output (vm, "%U", format_ip6_neighbor_ip6_entry, vm, n); + } + vec_free (ns); + + return error; +} + +VLIB_CLI_COMMAND (show_ip6_neighbors_command, static) = { + .path = "show ip6 neighbors", + .function = show_ip6_neighbors, + .short_help = "Show ip6 neighbors", +}; + +static clib_error_t * +set_ip6_neighbor (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_address_t addr; + u8 mac_address[6]; + int addr_valid = 0; + int is_del = 0; + u32 sw_if_index; + + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + /* intfc, ip6-address, mac-address */ + if (unformat (input, "%U %U %U", + unformat_vnet_sw_interface, vnm, &sw_if_index, + unformat_ip6_address, &addr, + unformat_ethernet_address, mac_address)) + addr_valid = 1; + + 
else if (unformat (input, "delete") || unformat (input, "del")) + is_del = 1; + else + break; + } + + if (!addr_valid) + return clib_error_return (0, "Missing interface, ip6 or hw address"); + + if (!is_del) + vnet_set_ip6_ethernet_neighbor (vm, sw_if_index, &addr, + mac_address, sizeof(mac_address)); + else + vnet_unset_ip6_ethernet_neighbor (vm, sw_if_index, &addr, + mac_address, sizeof(mac_address)); + return 0; +} + +VLIB_CLI_COMMAND (set_ip6_neighbor_command, static) = { + .path = "set ip6 neighbor", + .function = set_ip6_neighbor, + .short_help = "set ip6 neighbor [del] <intfc> <ip6-address> <mac-address>", +}; + +typedef enum { + ICMP6_NEIGHBOR_SOLICITATION_NEXT_DROP, + ICMP6_NEIGHBOR_SOLICITATION_NEXT_REPLY, + ICMP6_NEIGHBOR_SOLICITATION_N_NEXT, +} icmp6_neighbor_solicitation_or_advertisement_next_t; + +static_always_inline uword +icmp6_neighbor_solicitation_or_advertisement (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + uword is_solicitation) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im = &ip6_main; + ip_lookup_main_t * lm = &im->lookup_main; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next_index, n_advertisements_sent; + icmp6_neighbor_discovery_option_type_t option_type; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_icmp_input_node.index); + int bogus_length; + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (icmp6_input_trace_t)); + + option_type = + (is_solicitation + ? 
ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address + : ICMP6_NEIGHBOR_DISCOVERY_OPTION_target_link_layer_address); + n_advertisements_sent = 0; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + icmp6_neighbor_solicitation_or_advertisement_header_t * h0; + icmp6_neighbor_discovery_ethernet_link_layer_address_option_t * o0; + u32 bi0, options_len0, sw_if_index0, next0, error0; + u32 ip6_sadd_link_local, ip6_sadd_unspecified; + int is_rewrite0; + u32 ni0; + + bi0 = to_next[0] = from[0]; + + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + h0 = ip6_next_header (ip0); + options_len0 = clib_net_to_host_u16 (ip0->payload_length) - sizeof (h0[0]); + + error0 = ICMP6_ERROR_NONE; + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + ip6_sadd_link_local = ip6_address_is_link_local_unicast(&ip0->src_address); + ip6_sadd_unspecified = ip6_address_is_unspecified (&ip0->src_address); + + /* Check that source address is unspecified, link-local or else on-link. */ + if (!ip6_sadd_unspecified && !ip6_sadd_link_local) + { + u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0); + ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, src_adj_index0); + + /* Allow all realistic-looking rewrite adjacencies to pass */ + ni0 = adj0->lookup_next_index; + is_rewrite0 = (ni0 >= IP_LOOKUP_NEXT_ARP) && + (ni0 < IP_LOOKUP_N_NEXT); + + error0 = ((adj0->rewrite_header.sw_if_index != sw_if_index0 + || ! is_rewrite0) + ? ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_NOT_ON_LINK + : error0); + } + + o0 = (void *) (h0 + 1); + o0 = ((options_len0 == 8 && o0->header.type == option_type + && o0->header.n_data_u64s == 1) ? 
o0 : 0); + + /* If src address unspecified or link local, do not learn neighbor MAC */ + if (PREDICT_TRUE (error0 == ICMP6_ERROR_NONE && o0 != 0 && + !ip6_sadd_unspecified && !ip6_sadd_link_local)) + { + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + if (nm->limit_neighbor_cache_size && + pool_elts (nm->neighbor_pool) >= nm->limit_neighbor_cache_size) + unset_random_neighbor_entry(); + vnet_set_ip6_ethernet_neighbor ( + vm, sw_if_index0, + is_solicitation ? &ip0->src_address : &h0->target_address, + o0->ethernet_address, sizeof (o0->ethernet_address)); + } + + if (is_solicitation && error0 == ICMP6_ERROR_NONE) + { + /* Check that target address is one that we know about. */ + ip_interface_address_t * ia0; + ip6_address_fib_t ip6_af0; + void * oldheap; + + ip6_addr_fib_init (&ip6_af0, &h0->target_address, + vec_elt (im->fib_index_by_sw_if_index, + sw_if_index0)); + + /* Gross kludge, "thank you" MJ, don't even ask */ + oldheap = clib_mem_set_heap (clib_per_cpu_mheaps[0]); + ia0 = ip_get_interface_address (lm, &ip6_af0); + clib_mem_set_heap (oldheap); + error0 = ia0 == 0 ? + ICMP6_ERROR_NEIGHBOR_SOLICITATION_SOURCE_UNKNOWN : error0; + } + + if (is_solicitation) + next0 = (error0 != ICMP6_ERROR_NONE + ? ICMP6_NEIGHBOR_SOLICITATION_NEXT_DROP + : ICMP6_NEIGHBOR_SOLICITATION_NEXT_REPLY); + else + { + next0 = 0; + error0 = error0 == ICMP6_ERROR_NONE ? 
+ ICMP6_ERROR_NEIGHBOR_ADVERTISEMENTS_RX : error0; + } + + if (is_solicitation && error0 == ICMP6_ERROR_NONE) + { + vnet_sw_interface_t * sw_if0; + ethernet_interface_t * eth_if0; + ethernet_header_t *eth0; + + /* dst address is either source address or the all-nodes mcast addr */ + if(!ip6_sadd_unspecified) + ip0->dst_address = ip0->src_address; + else + ip6_set_reserved_multicast_address(&ip0->dst_address, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_all_hosts); + + ip0->src_address = h0->target_address; + ip0->hop_limit = 255; + h0->icmp.type = ICMP6_neighbor_advertisement; + + sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index0); + ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE); + eth_if0 = ethernet_get_interface (&ethernet_main, sw_if0->hw_if_index); + if (eth_if0 && o0) + { + memcpy (o0->ethernet_address, eth_if0->address, 6); + o0->header.type = + ICMP6_NEIGHBOR_DISCOVERY_OPTION_target_link_layer_address; + } + + h0->advertisement_flags = clib_host_to_net_u32 + (ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_SOLICITED + | ICMP6_NEIGHBOR_ADVERTISEMENT_FLAG_OVERRIDE); + + h0->icmp.checksum = 0; + h0->icmp.checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0, + &bogus_length); + ASSERT(bogus_length == 0); + + /* Reuse current MAC header, copy SMAC to DMAC and + * interface MAC to SMAC */ + vlib_buffer_reset (p0); + eth0 = vlib_buffer_get_current(p0); + memcpy(eth0->dst_address, eth0->src_address, 6); + memcpy(eth0->src_address, eth_if0->address, 6); + + /* Setup input and output sw_if_index for packet */ + ASSERT(vnet_buffer(p0)->sw_if_index[VLIB_RX] == sw_if_index0); + vnet_buffer(p0)->sw_if_index[VLIB_TX] = sw_if_index0; + vnet_buffer(p0)->sw_if_index[VLIB_RX] = + vnet_main.local_interface_sw_if_index; + + n_advertisements_sent++; + } + + p0->error = error_node->errors[error0]; + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, 
n_left_to_next); + } + + /* Account for advertisements sent. */ + vlib_error_count (vm, error_node->node_index, ICMP6_ERROR_NEIGHBOR_ADVERTISEMENTS_TX, n_advertisements_sent); + + return frame->n_vectors; +} + +/* for "syslogging" - use elog for now */ +#define foreach_log_level \ + _ (DEBUG, "DEBUG") \ + _ (INFO, "INFORMATION") \ + _ (NOTICE, "NOTICE") \ + _ (WARNING, "WARNING") \ + _ (ERR, "ERROR") \ + _ (CRIT, "CRITICAL") \ + _ (ALERT, "ALERT") \ + _ (EMERG, "EMERGENCY") + +typedef enum { +#define _(f,s) LOG_##f, + foreach_log_level +#undef _ +} log_level_t; + +static char * log_level_strings[] = { +#define _(f,s) s, + foreach_log_level +#undef _ +}; + +static int logmask = 1 << LOG_DEBUG; + +static void +ip6_neighbor_syslog(vlib_main_t *vm, int priority, char * fmt, ...) +{ + /* just use elog for now */ + u8 *what; + va_list va; + + if( (priority > LOG_EMERG) || + !(logmask & (1 << priority))) + return; + + va_start (va, fmt); + if(fmt) + { + what = va_format (0, fmt, &va); + + ELOG_TYPE_DECLARE (e) = { + .format = "ip6 nd: (%s): %s", + .format_args = "T4T4", + }; + struct { u32 s[2]; } * ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->s[0] = elog_string(&vm->elog_main, log_level_strings[priority]); + ed->s[1] = elog_string(&vm->elog_main, (char *)what); + } + va_end (va); + return; +} + +/* ipv6 neighbor discovery - router advertisements */ +typedef enum { + ICMP6_ROUTER_SOLICITATION_NEXT_DROP, + ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_RW, + ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_TX, + ICMP6_ROUTER_SOLICITATION_N_NEXT, +} icmp6_router_solicitation_or_advertisement_next_t; + +static_always_inline uword +icmp6_router_solicitation(vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im = &ip6_main; + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next_index; + u32 n_advertisements_sent = 0; + int 
bogus_length; + + icmp6_neighbor_discovery_option_type_t option_type; + + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_icmp_input_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (icmp6_input_trace_t)); + + /* source may append his LL address */ + option_type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + ip6_radv_t *radv_info = 0; + + icmp6_neighbor_discovery_header_t * h0; + icmp6_neighbor_discovery_ethernet_link_layer_address_option_t * o0; + + u32 bi0, options_len0, sw_if_index0, next0, error0; + u32 is_solicitation = 1, is_dropped = 0; + u32 is_unspecified, is_link_local; + + bi0 = to_next[0] = from[0]; + + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + h0 = ip6_next_header (ip0); + options_len0 = clib_net_to_host_u16 (ip0->payload_length) - sizeof (h0[0]); + is_unspecified = ip6_address_is_unspecified (&ip0->src_address); + is_link_local = ip6_address_is_link_local_unicast (&ip0->src_address); + + error0 = ICMP6_ERROR_NONE; + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + + /* check if solicitation (not from nd_timer node) */ + if (ip6_address_is_unspecified (&ip0->dst_address)) + is_solicitation = 0; + + /* Check that source address is unspecified, link-local or else on-link. 
*/ + if (!is_unspecified && !is_link_local) + { + u32 src_adj_index0 = ip6_src_lookup_for_packet (im, p0, ip0); + ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, src_adj_index0); + + error0 = ((adj0->rewrite_header.sw_if_index != sw_if_index0 + || (adj0->lookup_next_index != IP_LOOKUP_NEXT_ARP + && adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE)) + ? ICMP6_ERROR_ROUTER_SOLICITATION_SOURCE_NOT_ON_LINK + : error0); + } + + /* check for source LL option and process */ + o0 = (void *) (h0 + 1); + o0 = ((options_len0 == 8 + && o0->header.type == option_type + && o0->header.n_data_u64s == 1) + ? o0 + : 0); + + /* if src address unspecified IGNORE any options */ + if (PREDICT_TRUE (error0 == ICMP6_ERROR_NONE && o0 != 0 && + !is_unspecified && !is_link_local)) { + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + if (nm->limit_neighbor_cache_size && + pool_elts (nm->neighbor_pool) >= nm->limit_neighbor_cache_size) + unset_random_neighbor_entry(); + + vnet_set_ip6_ethernet_neighbor (vm, sw_if_index0, + &ip0->src_address, + o0->ethernet_address, + sizeof (o0->ethernet_address)); + } + + /* default is to drop */ + next0 = ICMP6_ROUTER_SOLICITATION_NEXT_DROP; + + if (error0 == ICMP6_ERROR_NONE) + { + vnet_sw_interface_t * sw_if0; + ethernet_interface_t * eth_if0; + u32 adj_index0; + + sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index0); + ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE); + eth_if0 = ethernet_get_interface (ðernet_main, sw_if0->hw_if_index); + + /* only support ethernet interface type for now */ + error0 = (!eth_if0) ? 
ICMP6_ERROR_ROUTER_SOLICITATION_UNSUPPORTED_INTF : error0; + + if (error0 == ICMP6_ERROR_NONE) + { + u32 ri; + + /* adjust the sizeof the buffer to just include the ipv6 header */ + p0->current_length -= (options_len0 + sizeof(icmp6_neighbor_discovery_header_t)); + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index0, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index0]; + + if(ri != ~0) + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + error0 = ((!radv_info) ? ICMP6_ERROR_ROUTER_SOLICITATION_RADV_NOT_CONFIG : error0); + + if (error0 == ICMP6_ERROR_NONE) + { + f64 now = vlib_time_now (vm); + + /* for solicited adverts - need to rate limit */ + if(is_solicitation) + { + if( (now - radv_info->last_radv_time) < MIN_DELAY_BETWEEN_RAS ) + is_dropped = 1; + else + radv_info->last_radv_time = now; + } + + /* send now */ + icmp6_router_advertisement_header_t rh; + + rh.icmp.type = ICMP6_router_advertisement; + rh.icmp.code = 0; + rh.icmp.checksum = 0; + + rh.current_hop_limit = radv_info->curr_hop_limit; + rh.router_lifetime_in_sec = clib_host_to_net_u16(radv_info->adv_router_lifetime_in_sec); + rh.time_in_msec_between_retransmitted_neighbor_solicitations = + clib_host_to_net_u32(radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations); + rh.neighbor_reachable_time_in_msec = + clib_host_to_net_u32(radv_info->adv_neighbor_reachable_time_in_msec); + + rh.flags = (radv_info->adv_managed_flag) ? ICMP6_ROUTER_DISCOVERY_FLAG_ADDRESS_CONFIG_VIA_DHCP : 0; + rh.flags |= ( (radv_info->adv_other_flag) ? 
ICMP6_ROUTER_DISCOVERY_FLAG_OTHER_CONFIG_VIA_DHCP : 0); + + + u16 payload_length = sizeof(icmp6_router_advertisement_header_t); + + vlib_buffer_add_data (vm, + p0->free_list_index, + bi0, + (void *)&rh, sizeof(icmp6_router_advertisement_header_t)); + + if(radv_info->adv_link_layer_address) + { + icmp6_neighbor_discovery_ethernet_link_layer_address_option_t h; + + h.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_source_link_layer_address; + h.header.n_data_u64s = 1; + + /* copy ll address */ + memcpy(&h.ethernet_address[0], eth_if0->address, 6); + + vlib_buffer_add_data (vm, + p0->free_list_index, + bi0, + (void *)&h, sizeof(icmp6_neighbor_discovery_ethernet_link_layer_address_option_t)); + + payload_length += sizeof(icmp6_neighbor_discovery_ethernet_link_layer_address_option_t); + } + + /* add MTU option */ + if(radv_info->adv_link_mtu) + { + icmp6_neighbor_discovery_mtu_option_t h; + + h.unused = 0; + h.mtu = clib_host_to_net_u32(radv_info->adv_link_mtu); + h.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_mtu; + h.header.n_data_u64s = 1; + + payload_length += sizeof( icmp6_neighbor_discovery_mtu_option_t); + + vlib_buffer_add_data (vm, + p0->free_list_index, + bi0, + (void *)&h, sizeof(icmp6_neighbor_discovery_mtu_option_t)); + } + + /* add advertised prefix options */ + ip6_radv_prefix_t *pr_info; + + pool_foreach (pr_info, radv_info->adv_prefixes_pool, ({ + + if(pr_info->enabled && + (!pr_info->decrement_lifetime_flag || (pr_info->pref_lifetime_expires >0))) + { + /* advertise this prefix */ + icmp6_neighbor_discovery_prefix_information_option_t h; + + h.header.type = ICMP6_NEIGHBOR_DISCOVERY_OPTION_prefix_information; + h.header.n_data_u64s = (sizeof(icmp6_neighbor_discovery_prefix_information_option_t) >> 3); + + h.dst_address_length = pr_info->prefix_len; + + h.flags = (pr_info->adv_on_link_flag) ? ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_FLAG_ON_LINK : 0; + h.flags |= (pr_info->adv_autonomous_flag) ? 
ICMP6_NEIGHBOR_DISCOVERY_PREFIX_INFORMATION_AUTO : 0; + + if(radv_info->cease_radv && pr_info->deprecated_prefix_flag) + { + h.valid_time = clib_host_to_net_u32(MIN_ADV_VALID_LIFETIME); + h.preferred_time = 0; + } + else + { + if(pr_info->decrement_lifetime_flag) + { + pr_info->adv_valid_lifetime_in_secs = ((pr_info->valid_lifetime_expires > now)) ? + (pr_info->valid_lifetime_expires - now) : 0; + + pr_info->adv_pref_lifetime_in_secs = ((pr_info->pref_lifetime_expires > now)) ? + (pr_info->pref_lifetime_expires - now) : 0; + } + + h.valid_time = clib_host_to_net_u32(pr_info->adv_valid_lifetime_in_secs); + h.preferred_time = clib_host_to_net_u32(pr_info->adv_pref_lifetime_in_secs) ; + } + h.unused = 0; + + memcpy(&h.dst_address, &pr_info->prefix, sizeof(ip6_address_t)); + + payload_length += sizeof( icmp6_neighbor_discovery_prefix_information_option_t); + + vlib_buffer_add_data (vm, + p0->free_list_index, + bi0, + (void *)&h, sizeof(icmp6_neighbor_discovery_prefix_information_option_t)); + + } + })); + + /* add additional options before here */ + + /* finish building the router advertisement... 
*/ + if(!is_unspecified && radv_info->send_unicast) + { + ip0->dst_address = ip0->src_address; + } + else + { + /* target address is all-nodes mcast addr */ + ip6_set_reserved_multicast_address(&ip0->dst_address, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_all_hosts); + } + + /* source address MUST be the link-local address */ + ip0->src_address = radv_info->link_local_address; + + ip0->hop_limit = 255; + ip0->payload_length = clib_host_to_net_u16 (payload_length); + + icmp6_router_advertisement_header_t * rh0 = (icmp6_router_advertisement_header_t *)(ip0 + 1); + rh0->icmp.checksum = + ip6_tcp_udp_icmp_compute_checksum (vm, p0, ip0, + &bogus_length); + ASSERT(bogus_length == 0); + + /* setup output if and adjacency */ + vnet_buffer (p0)->sw_if_index[VLIB_RX] = + vnet_main.local_interface_sw_if_index; + + if (is_solicitation) + { + ethernet_header_t *eth0; + /* Reuse current MAC header, copy SMAC to DMAC and + * interface MAC to SMAC */ + vlib_buffer_reset (p0); + eth0 = vlib_buffer_get_current(p0); + memcpy(eth0->dst_address, eth0->src_address, 6); + memcpy(eth0->src_address, eth_if0->address, 6); + next0 = is_dropped ? + next0 : ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_TX; + vnet_buffer(p0)->sw_if_index[VLIB_TX] = sw_if_index0; + } + else + { + adj_index0 = radv_info->all_nodes_adj_index; + if (adj_index0 == 0) + error0 = ICMP6_ERROR_DST_LOOKUP_MISS; + else + { + ip_adjacency_t * adj0 = ip_get_adjacency (&im->lookup_main, adj_index0); + error0 = + ((adj0->rewrite_header.sw_if_index != sw_if_index0 + || adj0->lookup_next_index != IP_LOOKUP_NEXT_REWRITE) + ? ICMP6_ERROR_ROUTER_SOLICITATION_DEST_UNKNOWN + : error0); + next0 = is_dropped ? 
+ next0 : ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_RW; + vnet_buffer (p0)->ip.adj_index[VLIB_RX] = adj_index0; + } + } + + radv_info->n_solicitations_dropped += is_dropped; + radv_info->n_solicitations_rcvd += is_solicitation; + + if((error0 == ICMP6_ERROR_NONE) && !is_dropped) + { + radv_info->n_advertisements_sent++; + n_advertisements_sent++; + } + } + } + } + + p0->error = error_node->errors[error0]; + + if(error0 != ICMP6_ERROR_NONE) + vlib_error_count (vm, error_node->node_index, error0, 1); + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Account for router advertisements sent. */ + vlib_error_count (vm, error_node->node_index, ICMP6_ERROR_ROUTER_ADVERTISEMENTS_TX, n_advertisements_sent); + + return frame->n_vectors; +} + + /* validate advertised info for consistancy (see RFC-4861 section 6.2.7) - log any inconsistencies, packet will always be dropped */ +static_always_inline uword +icmp6_router_advertisement(vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next_index; + u32 n_advertisements_rcvd = 0; + + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_icmp_input_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next_index = node->cached_next_index; + + if (node->flags & VLIB_NODE_FLAG_TRACE) + vlib_trace_frame_buffers_only (vm, node, from, frame->n_vectors, + /* stride */ 1, + sizeof (icmp6_input_trace_t)); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip0; + ip6_radv_t *radv_info = 0; + icmp6_router_advertisement_header_t 
* h0; + u32 bi0, options_len0, sw_if_index0, next0, error0; + + bi0 = to_next[0] = from[0]; + + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + ip0 = vlib_buffer_get_current (p0); + h0 = ip6_next_header (ip0); + options_len0 = clib_net_to_host_u16 (ip0->payload_length) - sizeof (h0[0]); + + error0 = ICMP6_ERROR_NONE; + sw_if_index0 = vnet_buffer (p0)->sw_if_index[VLIB_RX]; + + /* Check that source address is link-local*/ + error0 = (!ip6_address_is_link_local_unicast (&ip0->src_address)) ? + ICMP6_ERROR_ROUTER_ADVERTISEMENT_SOURCE_NOT_LINK_LOCAL : error0; + + /* default is to drop */ + next0 = ICMP6_ROUTER_SOLICITATION_NEXT_DROP; + + n_advertisements_rcvd++; + + if (error0 == ICMP6_ERROR_NONE) + { + vnet_sw_interface_t * sw_if0; + ethernet_interface_t * eth_if0; + + sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index0); + ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE); + eth_if0 = ethernet_get_interface (ðernet_main, sw_if0->hw_if_index); + + /* only support ethernet interface type for now */ + error0 = (!eth_if0) ? ICMP6_ERROR_ROUTER_SOLICITATION_UNSUPPORTED_INTF : error0; + + if (error0 == ICMP6_ERROR_NONE) + { + u32 ri; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index0, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index0]; + + if(ri != ~0) + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + error0 = ((!radv_info) ? 
ICMP6_ERROR_ROUTER_SOLICITATION_RADV_NOT_CONFIG : error0); + + if (error0 == ICMP6_ERROR_NONE) + { + /* validate advertised information */ + if((h0->current_hop_limit && radv_info->curr_hop_limit) && + (h0->current_hop_limit != radv_info->curr_hop_limit)) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our AdvCurHopLimit on %U doesn't agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + } + + if((h0->flags & ICMP6_ROUTER_DISCOVERY_FLAG_ADDRESS_CONFIG_VIA_DHCP) != + radv_info->adv_managed_flag) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our AdvManagedFlag on %U doesn't agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + } + + if((h0->flags & ICMP6_ROUTER_DISCOVERY_FLAG_OTHER_CONFIG_VIA_DHCP) != + radv_info->adv_other_flag) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our AdvOtherConfigFlag on %U doesn't agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + } + + if((h0->time_in_msec_between_retransmitted_neighbor_solicitations && + radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations) && + (h0->time_in_msec_between_retransmitted_neighbor_solicitations != + clib_host_to_net_u32(radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations))) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our AdvRetransTimer on %U doesn't agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + } + + if((h0->neighbor_reachable_time_in_msec && + radv_info->adv_neighbor_reachable_time_in_msec) && + (h0->neighbor_reachable_time_in_msec != + clib_host_to_net_u32(radv_info->adv_neighbor_reachable_time_in_msec))) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our AdvReachableTime on %U doesn't agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + } + + /* check for MTU or prefix options or 
.. */ + u8 * opt_hdr = (u8 *)(h0 + 1); + while( options_len0 > 0) + { + icmp6_neighbor_discovery_option_header_t *o0 = ( icmp6_neighbor_discovery_option_header_t *)opt_hdr; + int opt_len = o0->n_data_u64s << 3; + icmp6_neighbor_discovery_option_type_t option_type = o0->type; + + if(options_len0 < 2) + { + ip6_neighbor_syslog(vm, LOG_ERR, + "malformed RA packet on %U from %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + break; + } + + if(opt_len == 0) + { + ip6_neighbor_syslog(vm, LOG_ERR, + " zero length option in RA on %U from %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + break; + } + else if( opt_len > options_len0) + { + ip6_neighbor_syslog(vm, LOG_ERR, + "option length in RA packet greater than total length on %U from %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + break; + } + + options_len0 -= opt_len; + opt_hdr += opt_len; + + switch(option_type) + { + case ICMP6_NEIGHBOR_DISCOVERY_OPTION_mtu: + { + icmp6_neighbor_discovery_mtu_option_t *h = + (icmp6_neighbor_discovery_mtu_option_t *)(o0); + + if(opt_len < sizeof(*h)) + break; + + if((h->mtu && radv_info->adv_link_mtu) && + (h->mtu != clib_host_to_net_u32(radv_info->adv_link_mtu))) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our AdvLinkMTU on %U doesn't agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0, format_ip6_address, &ip0->src_address); + } + } + break; + + case ICMP6_NEIGHBOR_DISCOVERY_OPTION_prefix_information: + { + icmp6_neighbor_discovery_prefix_information_option_t *h = + (icmp6_neighbor_discovery_prefix_information_option_t *)(o0); + + /* validate advertised prefix options */ + ip6_radv_prefix_t *pr_info; + u32 preferred, valid; + + if(opt_len < sizeof(*h)) + break; + + preferred = clib_net_to_host_u32(h->preferred_time); + valid = clib_net_to_host_u32(h->valid_time); + + /* look for matching prefix - if we our advertising it, it 
better be consistant */ + pool_foreach (pr_info, radv_info->adv_prefixes_pool, ({ + + ip6_address_t mask; + ip6_address_mask_from_width(&mask, pr_info->prefix_len); + + if(pr_info->enabled && + (pr_info->prefix_len == h->dst_address_length) && + ip6_address_is_equal_masked (&pr_info->prefix, &h->dst_address, &mask)) + { + /* found it */ + if(!pr_info->decrement_lifetime_flag && + valid != pr_info->adv_valid_lifetime_in_secs) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our ADV validlifetime on %U for %U does not agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0,format_ip6_address, &pr_info->prefix, + format_ip6_address, &h->dst_address); + } + if(!pr_info->decrement_lifetime_flag && + preferred != pr_info->adv_pref_lifetime_in_secs) + { + ip6_neighbor_syslog(vm, LOG_WARNING, + "our ADV preferredlifetime on %U for %U does not agree with %U", + format_vnet_sw_if_index_name, vnm, sw_if_index0,format_ip6_address, &pr_info->prefix, + format_ip6_address, &h->dst_address); + } + } + break; + })); + break; + } + default: + /* skip this one */ + break; + } + } + } + } + } + + p0->error = error_node->errors[error0]; + + if(error0 != ICMP6_ERROR_NONE) + vlib_error_count (vm, error_node->node_index, error0, 1); + + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + /* Account for router advertisements sent. 
*/ + vlib_error_count (vm, error_node->node_index, ICMP6_ERROR_ROUTER_ADVERTISEMENTS_RX, n_advertisements_rcvd); + + return frame->n_vectors; +} + +/* create and initialize router advertisement parameters with default values for this intfc */ +static u32 +ip6_neighbor_sw_interface_add_del (vnet_main_t * vnm, + u32 sw_if_index, + u32 is_add) +{ + ip6_main_t * im = &ip6_main; + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip_lookup_main_t * lm = &im->lookup_main; + ip6_radv_t * a= 0; + u32 ri = ~0;; + vnet_sw_interface_t * sw_if0; + ethernet_interface_t * eth_if0 = 0; + + /* lookup radv container - ethernet interfaces only */ + sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index); + if(sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE) + eth_if0 = ethernet_get_interface (ðernet_main, sw_if0->hw_if_index); + + if(!eth_if0) + return ri; + + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + if(ri != ~0) + { + a = pool_elt_at_index (nm->if_radv_pool, ri); + + if(!is_add) + { + u32 i, * to_delete = 0; + ip6_radv_prefix_t *p; + ip6_mldp_group_t *m; + + /* remove adjacencies */ + ip_del_adjacency (lm, a->all_nodes_adj_index); + ip_del_adjacency (lm, a->all_routers_adj_index); + ip_del_adjacency (lm, a->all_mldv2_routers_adj_index); + + /* clean up prefix_pool */ + pool_foreach (p, a->adv_prefixes_pool, ({ + vec_add1 (to_delete, p - a->adv_prefixes_pool); + })); + + for (i = 0; i < vec_len (to_delete); i++) + { + p = pool_elt_at_index (a->adv_prefixes_pool, to_delete[i]); + mhash_unset (&a->address_to_prefix_index, &p->prefix, 0); + pool_put (a->adv_prefixes_pool, p); + } + + vec_free (to_delete); + to_delete = 0; + + /* clean up mldp group pool */ + pool_foreach (m, a->mldp_group_pool, ({ + vec_add1 (to_delete, m - a->mldp_group_pool); + })); + + for (i = 0; i < vec_len (to_delete); i++) + { + m = pool_elt_at_index (a->mldp_group_pool, to_delete[i]); + mhash_unset 
(&a->address_to_mldp_index, &m->mcast_address, 0); + pool_put (a->mldp_group_pool, m); + } + + vec_free (to_delete); + + pool_put (nm->if_radv_pool, a); + nm->if_radv_pool_index_by_sw_if_index[sw_if_index] = ~0; + ri = ~0; + } + } + else + { + if(is_add) + { + vnet_hw_interface_t * hw_if0; + + hw_if0 = vnet_get_sup_hw_interface (vnm, sw_if_index); + + pool_get (nm->if_radv_pool, a); + + ri = a - nm->if_radv_pool; + nm->if_radv_pool_index_by_sw_if_index[sw_if_index] = ri; + + /* initialize default values (most of which are zero) */ + memset (a, 0, sizeof (a[0])); + + a->sw_if_index = sw_if_index; + a->fib_index = ~0; + a->max_radv_interval = DEF_MAX_RADV_INTERVAL; + a->min_radv_interval = DEF_MIN_RADV_INTERVAL; + a->curr_hop_limit = DEF_CURR_HOP_LIMIT; + a->adv_router_lifetime_in_sec = DEF_DEF_RTR_LIFETIME; + + a->adv_link_layer_address = 1; /* send ll address source address option */ + + a->min_delay_between_radv = MIN_DELAY_BETWEEN_RAS; + a->max_delay_between_radv = MAX_DELAY_BETWEEN_RAS; + a->max_rtr_default_lifetime = MAX_DEF_RTR_LIFETIME; + a->seed = random_default_seed(); + + /* for generating random interface ids */ + a->randomizer = 0x1119194911191949; + a->randomizer = random_u64 ((u32 *)&a->randomizer); + + a->initial_adverts_count = MAX_INITIAL_RTR_ADVERTISEMENTS ; + a->initial_adverts_sent = a->initial_adverts_count-1; + a->initial_adverts_interval = MAX_INITIAL_RTR_ADVERT_INTERVAL; + + /* deafult is to send */ + a->send_radv = 1; + + /* fill in radv_info for this interface that will be needed later */ + a->adv_link_mtu = hw_if0->max_l3_packet_bytes[VLIB_RX]; + + memcpy (a->link_layer_address, eth_if0->address, 6); + + /* fill in default link-local address (this may be overridden) */ + ip6_link_local_address_from_ethernet_address (&a->link_local_address, eth_if0->address); + a->link_local_prefix_len = 64; + + mhash_init (&a->address_to_prefix_index, sizeof (uword), sizeof (ip6_address_t)); + mhash_init (&a->address_to_mldp_index, sizeof (uword), sizeof 
(ip6_address_t)); + + { + ip_adjacency_t *adj; + u8 link_layer_address[6] = + {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_all_hosts}; + + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &a->all_nodes_adj_index); + + adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE; + adj->if_address_index = ~0; + + vnet_rewrite_for_sw_interface + (vnm, + VNET_L3_PACKET_TYPE_IP6, + sw_if_index, + ip6_rewrite_node.index, + link_layer_address, + &adj->rewrite_header, + sizeof (adj->rewrite_data)); + } + + { + ip_adjacency_t *adj; + u8 link_layer_address[6] = + {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_all_routers}; + + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &a->all_routers_adj_index); + + adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE; + adj->if_address_index = ~0; + + vnet_rewrite_for_sw_interface + (vnm, + VNET_L3_PACKET_TYPE_IP6, + sw_if_index, + ip6_rewrite_node.index, + link_layer_address, + &adj->rewrite_header, + sizeof (adj->rewrite_data)); + } + + { + ip_adjacency_t *adj; + u8 link_layer_address[6] = + {0x33, 0x33, 0x00, 0x00, 0x00, IP6_MULTICAST_GROUP_ID_mldv2_routers}; + + adj = ip_add_adjacency (lm, /* template */ 0, /* block size */ 1, + &a->all_mldv2_routers_adj_index); + + adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE; + adj->if_address_index = ~0; + + vnet_rewrite_for_sw_interface + (vnm, + VNET_L3_PACKET_TYPE_IP6, + sw_if_index, + ip6_rewrite_node.index, + link_layer_address, + &adj->rewrite_header, + sizeof (adj->rewrite_data)); + } + + /* add multicast groups we will always be reporting */ + ip6_address_t addr; + ip6_mldp_group_t *mcast_group_info; + + ip6_set_reserved_multicast_address (&addr, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_all_hosts); + + /* lookup mldp info for this interface */ + + uword * p = mhash_get (&a->address_to_mldp_index, &addr); + mcast_group_info = p ? 
pool_elt_at_index (a->mldp_group_pool, p[0]) : 0; + + /* add address */ + if(!mcast_group_info) + { + /* add */ + u32 mi; + pool_get (a->mldp_group_pool, mcast_group_info); + + mi = mcast_group_info - a->mldp_group_pool; + mhash_set (&a->address_to_mldp_index, &addr, mi, /* old_value */ 0); + + mcast_group_info->type = 4; + mcast_group_info->mcast_source_address_pool = 0; + mcast_group_info->num_sources = 0; + memcpy(&mcast_group_info->mcast_address, &addr, sizeof(ip6_address_t)); + } + + ip6_set_reserved_multicast_address (&addr, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_all_routers); + + p = mhash_get (&a->address_to_mldp_index, &addr); + mcast_group_info = p ? pool_elt_at_index (a->mldp_group_pool, p[0]) : 0; + + if(!mcast_group_info) + { + /* add */ + u32 mi; + pool_get (a->mldp_group_pool, mcast_group_info); + + mi = mcast_group_info - a->mldp_group_pool; + mhash_set (&a->address_to_mldp_index, &addr, mi, /* old_value */ 0); + + mcast_group_info->type = 4; + mcast_group_info->mcast_source_address_pool = 0; + mcast_group_info->num_sources = 0; + memcpy(&mcast_group_info->mcast_address, &addr, sizeof(ip6_address_t)); + } + + ip6_set_reserved_multicast_address (&addr, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_mldv2_routers); + + p = mhash_get (&a->address_to_mldp_index, &addr); + mcast_group_info = p ? 
pool_elt_at_index (a->mldp_group_pool, p[0]) : 0; + + if(!mcast_group_info) + { + /* add */ + u32 mi; + pool_get (a->mldp_group_pool, mcast_group_info); + + mi = mcast_group_info - a->mldp_group_pool; + mhash_set (&a->address_to_mldp_index, &addr, mi, /* old_value */ 0); + + mcast_group_info->type = 4; + mcast_group_info->mcast_source_address_pool = 0; + mcast_group_info->num_sources = 0; + memcpy(&mcast_group_info->mcast_address, &addr, sizeof(ip6_address_t)); + } + } + } + return ri; +} + +/* send an mldpv2 report */ +static void +ip6_neighbor_send_mldpv2_report(u32 sw_if_index) +{ + vnet_main_t * vnm = vnet_get_main(); + vlib_main_t * vm = vnm->vlib_main; + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + vnet_sw_interface_t * sw_if0; + ethernet_interface_t * eth_if0; + u32 ri; + int bogus_length; + + ip6_radv_t *radv_info; + u16 payload_length; + vlib_buffer_t * b0; + ip6_header_t * ip0; + u32 * to_next; + vlib_frame_t * f; + u32 bo0; + u32 n_to_alloc = 1; + u32 n_allocated; + + icmp6_multicast_listener_report_header_t *rh0; + icmp6_multicast_listener_report_packet_t *rp0; + + sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index); + ASSERT (sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE); + eth_if0 = ethernet_get_interface (ðernet_main, sw_if0->hw_if_index); + + if (!eth_if0 || !vnet_sw_interface_is_admin_up (vnm, sw_if_index)) + return; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + if(ri == ~0) + return; + + /* send report now - build a mldpv2 report packet */ + n_allocated = vlib_buffer_alloc_from_free_list(vm, + &bo0, + n_to_alloc, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + if (PREDICT_FALSE(n_allocated == 0)) + { + clib_warning ("buffer allocation failure"); + return; + } + + b0 = vlib_get_buffer (vm, bo0); + + /* adjust the sizeof the buffer to just include the ipv6 header */ + b0->current_length 
= sizeof(icmp6_multicast_listener_report_packet_t); + + payload_length = sizeof(icmp6_multicast_listener_report_header_t); + + b0->error = ICMP6_ERROR_NONE; + + rp0 = vlib_buffer_get_current (b0); + ip0 = (ip6_header_t *)&rp0-> ip; + rh0 = (icmp6_multicast_listener_report_header_t *)&rp0-> report_hdr; + + memset (rp0 , 0x0, sizeof (icmp6_multicast_listener_report_packet_t)); + + ip0->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28); + + ip0->protocol = IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS; + /* for DEBUG - vnet driver won't seem to emit router alerts */ + /* ip0->protocol = IP_PROTOCOL_ICMP6; */ + ip0->hop_limit = 1; + + rh0->icmp.type = ICMP6_multicast_listener_report_v2; + + /* source address MUST be the link-local address */ + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + ip0->src_address = radv_info->link_local_address; + + /* destination is all mldpv2 routers */ + ip6_set_reserved_multicast_address(&ip0->dst_address, + IP6_MULTICAST_SCOPE_link_local, + IP6_MULTICAST_GROUP_ID_mldv2_routers); + + /* add reports here */ + ip6_mldp_group_t *m; + int num_addr_records = 0; + icmp6_multicast_address_record_t rr; + + /* fill in the hop-by-hop extension header (router alert) info */ + rh0->ext_hdr.next_hdr = IP_PROTOCOL_ICMP6; + rh0->ext_hdr.n_data_u64s = 0; + + rh0->alert.type = IP6_MLDP_ALERT_TYPE; + rh0->alert.len = 2; + rh0->alert.value = 0; + + rh0->pad.type = 1; + rh0->pad.len = 0; + + rh0->icmp.checksum = 0; + + pool_foreach (m, radv_info->mldp_group_pool, ({ + + rr.type = m->type; + rr.aux_data_len_u32s = 0; + rr.num_sources = clib_host_to_net_u16 (m->num_sources); + memcpy(&rr.mcast_addr, &m->mcast_address, sizeof(ip6_address_t)); + + num_addr_records++; + + vlib_buffer_add_data (vm, + b0->free_list_index, + bo0, + (void *)&rr, sizeof(icmp6_multicast_address_record_t)); + + payload_length += sizeof( icmp6_multicast_address_record_t); + })); + + rh0->rsvd = 0; + rh0->num_addr_records = clib_host_to_net_u16(num_addr_records); + 
+ /* update lengths */ + ip0->payload_length = clib_host_to_net_u16 (payload_length); + + rh0->icmp.checksum = ip6_tcp_udp_icmp_compute_checksum (vm, b0, ip0, + &bogus_length); + ASSERT(bogus_length == 0); + + /* + * OK to override w/ no regard for actual FIB, because + * ip6-rewrite-local only looks at the adjacency. + */ + vnet_buffer (b0)->sw_if_index[VLIB_RX] = + vnet_main.local_interface_sw_if_index; + + vnet_buffer (b0)->ip.adj_index[VLIB_RX] = + radv_info->all_mldv2_routers_adj_index; + + vlib_node_t * node = vlib_get_node_by_name (vm, (u8 *) "ip6-rewrite-local"); + + f = vlib_get_frame_to_node (vm, node->index); + to_next = vlib_frame_vector_args (f); + to_next[0] = bo0; + f->n_vectors = 1; + + vlib_put_frame_to_node (vm, node->index, f); + return; +} + +VLIB_REGISTER_NODE (ip6_icmp_router_solicitation_node,static) = { + .function = icmp6_router_solicitation, + .name = "icmp6-router-solicitation", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp6_input_trace, + + .n_next_nodes = ICMP6_ROUTER_SOLICITATION_N_NEXT, + .next_nodes = { + [ICMP6_ROUTER_SOLICITATION_NEXT_DROP] = "error-drop", + [ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_RW] = "ip6-rewrite-local", + [ICMP6_ROUTER_SOLICITATION_NEXT_REPLY_TX] = "interface-output", + }, +}; + +/* send a RA or update the timer info etc.. 
*/ +static uword +ip6_neighbor_process_timer_event (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip6_radv_t *radv_info; + vlib_frame_t * f = 0; + u32 n_this_frame = 0; + u32 n_left_to_next; + u32 * to_next; + u32 bo0; + icmp6_router_solicitation_header_t * h0; + vlib_buffer_t * b0; + f64 now = vlib_time_now (vm); + + /* Interface ip6 radv info list */ + pool_foreach (radv_info, nm->if_radv_pool, ({ + + if( !vnet_sw_interface_is_admin_up (vnm, radv_info->sw_if_index)) + { + radv_info->initial_adverts_sent = radv_info->initial_adverts_count-1; + radv_info->next_multicast_time = now; + radv_info->last_multicast_time = now; + radv_info->last_radv_time = 0; + radv_info->all_routers_mcast = 0; + continue; + } + + /* Make sure that we've joined the all-routers multicast group */ + if(!radv_info->all_routers_mcast) + { + /* send MDLP_REPORT_EVENT message */ + ip6_neighbor_send_mldpv2_report(radv_info->sw_if_index); + radv_info->all_routers_mcast = 1; + } + + /* is it time to send a multicast RA on this interface? 
*/ + if(radv_info->send_radv && (now >= radv_info->next_multicast_time)) + { + u32 n_to_alloc = 1; + u32 n_allocated; + + f64 rfn = (radv_info->max_radv_interval - radv_info->min_radv_interval) * + random_f64 (&radv_info->seed) + radv_info->min_radv_interval; + + /* multicast send - compute next multicast send time */ + if( radv_info->initial_adverts_sent > 0) + { + radv_info->initial_adverts_sent--; + if(rfn > radv_info-> initial_adverts_interval) + rfn = radv_info-> initial_adverts_interval; + + /* check to see if we are ceasing to send */ + if( radv_info->initial_adverts_sent == 0) + if(radv_info->cease_radv) + radv_info->send_radv = 0; + } + + radv_info->next_multicast_time = rfn + now; + radv_info->last_multicast_time = now; + + /* send advert now - build a "solicted" router advert with unspecified source address */ + n_allocated = vlib_buffer_alloc_from_free_list(vm, + &bo0, + n_to_alloc, + VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX); + + if (PREDICT_FALSE(n_allocated == 0)) + { + clib_warning ("buffer allocation failure"); + continue; + } + b0 = vlib_get_buffer (vm, bo0); + b0->current_length = sizeof( icmp6_router_solicitation_header_t); + b0->error = ICMP6_ERROR_NONE; + vnet_buffer (b0)->sw_if_index[VLIB_RX] = radv_info->sw_if_index; + + h0 = vlib_buffer_get_current (b0); + + memset (h0, 0, sizeof (icmp6_router_solicitation_header_t)); + + h0->ip.ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28); + h0->ip.payload_length = clib_host_to_net_u16 (sizeof (icmp6_router_solicitation_header_t) + - STRUCT_OFFSET_OF (icmp6_router_solicitation_header_t, neighbor)); + h0->ip.protocol = IP_PROTOCOL_ICMP6; + h0->ip.hop_limit = 255; + + /* set src/dst address as "unspecified" this marks this packet as internally generated rather than recieved */ + h0->ip.src_address.as_u64[0] = 0; + h0->ip.src_address.as_u64[1] = 0; + + h0->ip.dst_address.as_u64[0] = 0; + h0->ip.dst_address.as_u64[1] = 0; + + h0->neighbor.icmp.type = ICMP6_router_solicitation; + + if 
(PREDICT_FALSE(f == 0)) + { + f = vlib_get_frame_to_node (vm, ip6_icmp_router_solicitation_node.index); + to_next = vlib_frame_vector_args (f); + n_left_to_next = VLIB_FRAME_SIZE; + n_this_frame = 0; + } + + n_this_frame++; + n_left_to_next--; + to_next[0] = bo0; + to_next += 1; + + if (PREDICT_FALSE(n_left_to_next == 0)) + { + f->n_vectors = n_this_frame; + vlib_put_frame_to_node (vm, ip6_icmp_router_solicitation_node.index, f); + f = 0; + } + } + })); + + if (f) + { + ASSERT(n_this_frame); + f->n_vectors = n_this_frame; + vlib_put_frame_to_node (vm, ip6_icmp_router_solicitation_node.index, f); + } + return 0; +} + +static uword +ip6_icmp_neighbor_discovery_event_process (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + uword event_type; + ip6_icmp_neighbor_discovery_event_data_t * event_data; + + /* init code here */ + + while (1) + { + vlib_process_wait_for_event_or_clock (vm, 1. /* seconds */); + + event_data = vlib_process_get_event_data (vm, &event_type); + + if(!event_data) + { + /* No events found: timer expired. 
*/ + /* process interface list and send RAs as appropriate, update timer info */ + ip6_neighbor_process_timer_event (vm, node, frame); + } + else + { + switch (event_type) { + + case ICMP6_ND_EVENT_INIT: + break; + + case ~0: + break; + + default: + ASSERT (0); + } + + if (event_data) + _vec_len (event_data) = 0; + } + } + return frame->n_vectors; +} + +VLIB_REGISTER_NODE (ip6_icmp_router_advertisement_node,static) = { + .function = icmp6_router_advertisement, + .name = "icmp6-router-advertisement", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp6_input_trace, + + .n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +vlib_node_registration_t ip6_icmp_neighbor_discovery_event_node = { + + .function = ip6_icmp_neighbor_discovery_event_process, + .name = "ip6-icmp-neighbor-discovery-event-process", + .type = VLIB_NODE_TYPE_PROCESS, +}; + +static uword +icmp6_neighbor_solicitation (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return icmp6_neighbor_solicitation_or_advertisement (vm, node, frame, /* is_solicitation */ 1); } + +static uword +icmp6_neighbor_advertisement (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return icmp6_neighbor_solicitation_or_advertisement (vm, node, frame, /* is_solicitation */ 0); } + +VLIB_REGISTER_NODE (ip6_icmp_neighbor_solicitation_node,static) = { + .function = icmp6_neighbor_solicitation, + .name = "icmp6-neighbor-solicitation", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp6_input_trace, + + .n_next_nodes = ICMP6_NEIGHBOR_SOLICITATION_N_NEXT, + .next_nodes = { + [ICMP6_NEIGHBOR_SOLICITATION_NEXT_DROP] = "error-drop", + [ICMP6_NEIGHBOR_SOLICITATION_NEXT_REPLY] = "interface-output", + }, +}; + +VLIB_REGISTER_NODE (ip6_icmp_neighbor_advertisement_node,static) = { + .function = icmp6_neighbor_advertisement, + .name = "icmp6-neighbor-advertisement", + + .vector_size = sizeof (u32), + + .format_trace = format_icmp6_input_trace, + + 
.n_next_nodes = 1, + .next_nodes = { + [0] = "error-drop", + }, +}; + +/* API support functions */ +int +ip6_neighbor_ra_config(vlib_main_t * vm, u32 sw_if_index, + u8 surpress, u8 managed, u8 other, + u8 ll_option, u8 send_unicast, u8 cease, + u8 use_lifetime, u32 lifetime, + u32 initial_count, u32 initial_interval, + u32 max_interval, u32 min_interval, + u8 is_no) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + int error; + u32 ri; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + error = (ri != ~0) ? 0 : VNET_API_ERROR_INVALID_SW_IF_INDEX; + + if(!error) + { + + ip6_radv_t * radv_info; + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + if((max_interval != 0) && (min_interval ==0)) + min_interval = .75 * max_interval; + + max_interval = (max_interval != 0) ? ( (is_no) ? DEF_MAX_RADV_INTERVAL : max_interval) : radv_info->max_radv_interval; + min_interval = (min_interval != 0) ? ( (is_no) ? DEF_MIN_RADV_INTERVAL : min_interval) : radv_info->min_radv_interval; + lifetime = (use_lifetime != 0) ? ( (is_no) ? DEF_DEF_RTR_LIFETIME : lifetime) : radv_info->adv_router_lifetime_in_sec; + + if(lifetime) + { + if(lifetime > MAX_DEF_RTR_LIFETIME) + lifetime = MAX_DEF_RTR_LIFETIME; + + if(lifetime <= max_interval) + return VNET_API_ERROR_INVALID_VALUE; + } + + if(min_interval != 0) + { + if((min_interval > .75 * max_interval) || + (min_interval < 3)) + return VNET_API_ERROR_INVALID_VALUE; + } + + if((initial_count > MAX_INITIAL_RTR_ADVERTISEMENTS) || + (initial_interval > MAX_INITIAL_RTR_ADVERT_INTERVAL)) + return VNET_API_ERROR_INVALID_VALUE; + + /* + if "flag" is set and is_no is true then restore default value else set value corresponding to "flag" + if "flag" is clear don't change corresponding value + */ + radv_info->send_radv = (surpress != 0) ? ( (is_no != 0) ? 
1 : 0 ) : radv_info->send_radv; + radv_info->adv_managed_flag = ( managed != 0) ? ( (is_no) ? 0 : 1) : radv_info->adv_managed_flag; + radv_info->adv_other_flag = (other != 0) ? ( (is_no) ? 0: 1) : radv_info->adv_other_flag; + radv_info->adv_link_layer_address = ( ll_option != 0) ? ( (is_no) ? 1 : 0) : radv_info->adv_link_layer_address; + radv_info->send_unicast = (send_unicast != 0) ? ( (is_no) ? 0 : 1) : radv_info->send_unicast; + radv_info->cease_radv = ( cease != 0) ? ( (is_no) ? 0 : 1) : radv_info->cease_radv; + + radv_info->min_radv_interval = min_interval; + radv_info->max_radv_interval = max_interval; + radv_info->adv_router_lifetime_in_sec = lifetime; + + radv_info->initial_adverts_count = + (initial_count != 0) ? ( (is_no) ? MAX_INITIAL_RTR_ADVERTISEMENTS : initial_count) : radv_info->initial_adverts_count ; + radv_info->initial_adverts_interval = + (initial_interval != 0) ? ( (is_no) ? MAX_INITIAL_RTR_ADVERT_INTERVAL : initial_interval) : radv_info->initial_adverts_interval; + + /* restart */ + if((cease != 0) && (is_no)) + radv_info-> send_radv = 1; + + radv_info->initial_adverts_sent = radv_info->initial_adverts_count -1; + radv_info->next_multicast_time = vlib_time_now (vm); + radv_info->last_multicast_time = vlib_time_now (vm); + radv_info->last_radv_time = 0; + } + return(error); +} + +int +ip6_neighbor_ra_prefix(vlib_main_t * vm, u32 sw_if_index, + ip6_address_t *prefix_addr, u8 prefix_len, + u8 use_default, u32 val_lifetime, u32 pref_lifetime, + u8 no_advertise, u8 off_link, u8 no_autoconfig, u8 no_onlink, + u8 is_no) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + int error; + + u32 ri; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + error = (ri != ~0) ? 
0 : VNET_API_ERROR_INVALID_SW_IF_INDEX; + + if(!error) + { + f64 now = vlib_time_now (vm); + ip6_radv_t * radv_info; + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + /* prefix info add, delete or update */ + ip6_radv_prefix_t * prefix; + + /* lookup prefix info for this address on this interface */ + uword * p = mhash_get (&radv_info->address_to_prefix_index, prefix_addr); + + prefix = p ? pool_elt_at_index (radv_info->adv_prefixes_pool, p[0]) : 0; + + if(is_no) + { + /* delete */ + if(!prefix) + return VNET_API_ERROR_INVALID_VALUE; /* invalid prefix */ + + if(prefix->prefix_len != prefix_len) + return VNET_API_ERROR_INVALID_VALUE_2; + + /* FIXME - Should the DP do this or the CP ?*/ + /* do specific delete processing here before returning */ + /* try to remove from routing table */ + + mhash_unset (&radv_info->address_to_prefix_index, prefix_addr,/* old_value */ 0); + pool_put (radv_info->adv_prefixes_pool, prefix); + + radv_info->initial_adverts_sent = radv_info->initial_adverts_count -1; + radv_info->next_multicast_time = vlib_time_now (vm); + radv_info->last_multicast_time = vlib_time_now (vm); + radv_info->last_radv_time = 0; + return(error); + } + + /* adding or changing */ + if(!prefix) + { + /* add */ + u32 pi; + pool_get (radv_info->adv_prefixes_pool, prefix); + pi = prefix - radv_info->adv_prefixes_pool; + mhash_set (&radv_info->address_to_prefix_index, prefix_addr, pi, /* old_value */ 0); + + memset(prefix, 0x0, sizeof(ip6_radv_prefix_t)); + + prefix->prefix_len = prefix_len; + memcpy(&prefix->prefix, prefix_addr, sizeof(ip6_address_t)); + + /* initialize default values */ + prefix->adv_on_link_flag = 1; /* L bit set */ + prefix->adv_autonomous_flag = 1; /* A bit set */ + prefix->adv_valid_lifetime_in_secs = DEF_ADV_VALID_LIFETIME; + prefix->adv_pref_lifetime_in_secs = DEF_ADV_PREF_LIFETIME; + prefix->enabled = 1; + prefix->decrement_lifetime_flag = 1; + prefix->deprecated_prefix_flag = 1; + + if(off_link == 0) + { + /* FIXME - Should the DP 
do this or the CP ?*/ + /* insert prefix into routing table as a connected prefix */ + } + + if(use_default) + goto restart; + } + else + { + + if(prefix->prefix_len != prefix_len) + return VNET_API_ERROR_INVALID_VALUE_2; + + if(off_link != 0) + { + /* FIXME - Should the DP do this or the CP ?*/ + /* remove from routing table if already there */ + } + } + + if((val_lifetime == ~0) || (pref_lifetime == ~0)) + { + prefix->adv_valid_lifetime_in_secs = ~0; + prefix->adv_pref_lifetime_in_secs = ~0; + prefix->decrement_lifetime_flag = 0; + } + else + { + prefix->adv_valid_lifetime_in_secs = val_lifetime;; + prefix->adv_pref_lifetime_in_secs = pref_lifetime; + } + + /* copy remaining */ + prefix->enabled = !(no_advertise != 0); + prefix->adv_on_link_flag = !((off_link != 0) || (no_onlink != 0)); + prefix->adv_autonomous_flag = !(no_autoconfig != 0); + + restart: + /* restart */ + /* fill in the expiration times */ + prefix->valid_lifetime_expires = now + prefix->adv_valid_lifetime_in_secs; + prefix->pref_lifetime_expires = now + prefix->adv_pref_lifetime_in_secs; + + radv_info->initial_adverts_sent = radv_info->initial_adverts_count -1; + radv_info->next_multicast_time = vlib_time_now (vm); + radv_info->last_multicast_time = vlib_time_now (vm); + radv_info->last_radv_time = 0; + } + return(error); +} + +clib_error_t * +ip6_neighbor_cmd(vlib_main_t * vm, unformat_input_t * main_input, vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + clib_error_t * error = 0; + u8 is_no = 0; + u8 surpress = 0, managed = 0, other = 0; + u8 surpress_ll_option = 0, send_unicast = 0, cease= 0; + u8 use_lifetime = 0; + u32 sw_if_index, ra_lifetime = 0, ra_initial_count = 0, ra_initial_interval = 0; + u32 ra_max_interval = 0 , ra_min_interval = 0; + + unformat_input_t _line_input, * line_input = &_line_input; + vnet_sw_interface_t * sw_if0; + + int add_radv_info = 1; + __attribute__((unused)) ip6_radv_t * radv_info = 0; + 
ip6_address_t ip6_addr; + u32 addr_len; + + + /* Get a line of input. */ + if (! unformat_user (main_input, unformat_line_input, line_input)) + return 0; + + /* get basic radv info for this interface */ + if(unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + + if (unformat_user (line_input, + unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + u32 ri; + ethernet_interface_t * eth_if0 = 0; + + sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index); + if(sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE) + eth_if0 = ethernet_get_interface (ðernet_main, sw_if0->hw_if_index); + + if(!eth_if0) + { + error = clib_error_return (0, "Interface must be of ethernet type"); + goto done; + } + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + if(ri != ~0) + { + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + } + else + { + error = clib_error_return (0, "unknown interface %U'", + format_unformat_error, line_input); + goto done; + } + } + else + { + error = clib_error_return (0, "invalid interface name %U'", + format_unformat_error, line_input); + goto done; + } + } + + /* get the rest of the command */ + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "no")) + is_no = 1; + else if(unformat (line_input, "prefix %U/%d", + unformat_ip6_address, &ip6_addr, + &addr_len)) + { + add_radv_info = 0; + break; + } + else if (unformat (line_input, "ra-managed-config-flag")) + { + managed = 1; + break; + } + else if (unformat (line_input, "ra-other-config-flag")) + { + other = 1; + break; + } + else if (unformat (line_input, "ra-surpress")) + { + surpress = 1; + break; + } + else if (unformat (line_input, "ra-surpress-link-layer")) + { + surpress_ll_option = 1; + break; + } + else if (unformat (line_input, "ra-send-unicast")) + { + send_unicast = 1; + break; + } + 
else if (unformat (line_input, "ra-lifetime")) + { + if (!unformat (line_input, "%d", &ra_lifetime)) + return(error = unformat_parse_error (line_input)); + use_lifetime = 1; + break; + } + else if (unformat (line_input, "ra-initial")) + { + if (!unformat (line_input, "%d %d", &ra_initial_count, &ra_initial_interval)) + return(error = unformat_parse_error (line_input)); + break; + } + else if (unformat (line_input, "ra-interval")) + { + if (!unformat (line_input, "%d", &ra_max_interval)) + return(error = unformat_parse_error (line_input)); + + if (!unformat (line_input, "%d", &ra_min_interval)) + ra_min_interval = 0; + break; + } + else if(unformat (line_input, "ra-cease")) + { + cease = 1; + break; + } + else + return(unformat_parse_error (line_input)); + } + + if(add_radv_info) + { + ip6_neighbor_ra_config(vm, sw_if_index, + surpress, managed, other, + surpress_ll_option, send_unicast, cease, + use_lifetime, ra_lifetime, + ra_initial_count, ra_initial_interval, + ra_max_interval, ra_min_interval, + is_no); + } + else + { + u32 valid_lifetime_in_secs = 0; + u32 pref_lifetime_in_secs = 0; + u8 use_prefix_default_values = 0; + u8 no_advertise = 0; + u8 off_link= 0; + u8 no_autoconfig = 0; + u8 no_onlink= 0; + + /* get the rest of the command */ + while(unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if(unformat (line_input, "default")) + { + use_prefix_default_values = 1; + break; + } + else if(unformat (line_input, "infinite")) + { + valid_lifetime_in_secs = ~0; + pref_lifetime_in_secs = ~0; + break; + } + else if(unformat (line_input, "%d %d", &valid_lifetime_in_secs, + &pref_lifetime_in_secs)) + break; + else + break; + } + + + /* get the rest of the command */ + while (!use_prefix_default_values && + unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if(unformat (line_input, "no-advertise")) + no_advertise = 1; + else if(unformat (line_input, "off-link")) + off_link = 1; + else if(unformat (line_input, "no-autoconfig")) + 
no_autoconfig = 1; + else if(unformat (line_input, "no-onlink")) + no_onlink = 1; + else + return(unformat_parse_error (line_input)); + } + + ip6_neighbor_ra_prefix(vm, sw_if_index, + &ip6_addr, addr_len, + use_prefix_default_values, + valid_lifetime_in_secs, + pref_lifetime_in_secs, + no_advertise, + off_link, + no_autoconfig, + no_onlink, + is_no); + } + + unformat_free (line_input); + + done: + return error; +} + +static clib_error_t * +show_ip6_interface_cmd (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + clib_error_t * error = 0; + u32 sw_if_index; + + sw_if_index = ~0; + + if (unformat_user (input, + unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + u32 ri; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + if(ri != ~0) + { + ip_lookup_main_t * lm = &ip6_main.lookup_main; + ip6_radv_t * radv_info; + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + vlib_cli_output (vm, "%U is admin %s\n", format_vnet_sw_interface_name, vnm, + vnet_get_sw_interface (vnm, sw_if_index), + (vnet_sw_interface_is_admin_up (vnm, sw_if_index) ? 
"up" : "down")); + + u32 ai; + u32 *global_scope = 0,i; + ip_interface_address_t * a; + + vec_validate_init_empty (lm->if_address_pool_index_by_sw_if_index, sw_if_index, ~0); + ai = lm->if_address_pool_index_by_sw_if_index[sw_if_index]; + + while (ai != (u32)~0) + { + a = pool_elt_at_index(lm->if_address_pool, ai); + ip6_address_t * address = ip_interface_address_get_address (lm, a); + + if( ip6_address_is_link_local_unicast (address)) + vlib_cli_output (vm, "\tIPv6 is enabled, link-local address is %U\n", format_ip6_address, + address); + + if((address->as_u8[0] & 0xe0) == 0x20) + vec_add1 (global_scope, ai); + + ai = a->next_this_sw_interface; + } + + vlib_cli_output (vm, "\tGlobal unicast address(es):\n"); + for (i = 0; i < vec_len (global_scope); i++) + { + a = pool_elt_at_index(lm->if_address_pool, global_scope[i]); + ip6_address_t * address = ip_interface_address_get_address (lm, a); + ip6_address_t mask, subnet; + + subnet = *address; + ip6_address_mask_from_width(&mask, a->address_length); + ip6_address_mask(&subnet, &mask); + + vlib_cli_output (vm, "\t\t%U, subnet is %U/%d", + format_ip6_address, address, + format_ip6_address,&subnet, + a->address_length); + } + vec_free (global_scope); + vlib_cli_output (vm, "\tJoined group address(es):\n"); + ip6_mldp_group_t *m; + pool_foreach (m, radv_info->mldp_group_pool, ({ + vlib_cli_output (vm, "\t\t%U\n", format_ip6_address, &m->mcast_address); + })); + + vlib_cli_output (vm, "\tAdvertised Prefixes:\n"); + ip6_radv_prefix_t * p; + pool_foreach (p, radv_info->adv_prefixes_pool, ({ + vlib_cli_output (vm, "\t\tprefix %U, length %d\n", + format_ip6_address, &p->prefix, p->prefix_len); + })); + + vlib_cli_output (vm, "\tMTU is %d\n", radv_info->adv_link_mtu); + vlib_cli_output (vm, "\tICMP error messages are unlimited\n"); + vlib_cli_output (vm, "\tICMP redirects are disabled\n"); + vlib_cli_output (vm, "\tICMP unreachables are not sent\n"); + vlib_cli_output (vm, "\tND DAD is disabled\n"); + //vlib_cli_output (vm, 
"\tND reachable time is %d milliseconds\n",); + vlib_cli_output (vm, "\tND advertised reachable time is %d\n", + radv_info->adv_neighbor_reachable_time_in_msec); + vlib_cli_output (vm, "\tND advertised retransmit interval is %d (msec)\n", + radv_info->adv_time_in_msec_between_retransmitted_neighbor_solicitations); + + u32 ra_interval = radv_info->max_radv_interval; + u32 ra_interval_min = radv_info->min_radv_interval; + vlib_cli_output (vm, "\tND router advertisements are sent every %d seconds (min interval is %d)\n", + ra_interval, ra_interval_min); + vlib_cli_output (vm, "\tND router advertisements live for %d seconds\n", + radv_info->adv_router_lifetime_in_sec); + vlib_cli_output (vm, "\tHosts %s stateless autoconfig for addresses\n", + (radv_info->adv_managed_flag) ? "use" :" don't use"); + vlib_cli_output (vm, "\tND router advertisements sent %d\n", radv_info->n_advertisements_sent); + vlib_cli_output (vm, "\tND router solicitations received %d\n", radv_info->n_solicitations_rcvd); + vlib_cli_output (vm, "\tND router solicitations dropped %d\n", radv_info->n_solicitations_dropped); + } + else + { + error = clib_error_return (0, "Ipv6 not enabled on interface", + format_unformat_error, input); + + } + } + return error; +} + +VLIB_CLI_COMMAND (show_ip6_interface_command, static) = { + .path = "show ip6 interface", + .function = show_ip6_interface_cmd, + .short_help = "Show ip6 interface <iface name>", +}; + +clib_error_t * +disable_ip6_interface(vlib_main_t * vm, + u32 sw_if_index) +{ + clib_error_t * error = 0; + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + u32 ri; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + /* if not created - do nothing */ + if(ri != ~0) + { + vnet_main_t * vnm = vnet_get_main(); + ip6_radv_t * radv_info; + + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + /* check 
radv_info ref count for other ip6 addresses on this interface */ + if(radv_info->ref_count == 0 ) + { + /* essentially "disables" ipv6 on this interface */ + error = ip6_add_del_interface_address (vm, sw_if_index, + &radv_info->link_local_address, + radv_info->link_local_prefix_len, + 1 /* is_del */); + + ip6_neighbor_sw_interface_add_del (vnm, sw_if_index, 0/* is_add */); + } + } + return error; +} + +int +ip6_interface_enabled(vlib_main_t * vm, + u32 sw_if_index) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + u32 ri = ~0; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + return ri != ~0; +} + +clib_error_t * +enable_ip6_interface(vlib_main_t * vm, + u32 sw_if_index) +{ + clib_error_t * error = 0; + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + u32 ri; + int is_add = 1; + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + /* if not created yet */ + if(ri == ~0) + { + vnet_main_t * vnm = vnet_get_main(); + vnet_sw_interface_t * sw_if0; + + sw_if0 = vnet_get_sup_sw_interface (vnm, sw_if_index); + if(sw_if0->type == VNET_SW_INTERFACE_TYPE_HARDWARE) + { + ethernet_interface_t * eth_if0; + + eth_if0 = ethernet_get_interface (ðernet_main, sw_if0->hw_if_index); + if(eth_if0) + { + /* create radv_info. for this interface. 
This holds all the info needed for router adverts */ + ri = ip6_neighbor_sw_interface_add_del (vnm, sw_if_index, is_add); + + if(ri != ~0) + { + ip6_radv_t * radv_info; + ip6_address_t link_local_address; + + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + ip6_link_local_address_from_ethernet_mac_address (&link_local_address, + eth_if0->address); + + sw_if0 = vnet_get_sw_interface (vnm, sw_if_index); + if(sw_if0->type == VNET_SW_INTERFACE_TYPE_SUB) + { + /* make up an interface id */ + md5_context_t m; + u8 digest[16]; + + link_local_address.as_u64[0] = radv_info->randomizer; + + md5_init (&m); + md5_add (&m, &link_local_address, 16); + md5_finish (&m, digest); + + memcpy(&link_local_address, digest, 16); + + radv_info->randomizer = link_local_address.as_u64[0]; + + link_local_address.as_u64[0] = clib_host_to_net_u64 (0xFE80000000000000ULL); + /* clear u bit */ + link_local_address.as_u8[8] &= 0xfd; + } + + /* essentially "enables" ipv6 on this interface */ + error = ip6_add_del_interface_address (vm, sw_if_index, + &link_local_address, 64 /* address width */, + 0 /* is_del */); + + if(error) + ip6_neighbor_sw_interface_add_del (vnm, sw_if_index, !is_add); + else + { + radv_info->link_local_address = link_local_address; + radv_info->link_local_prefix_len = 64; + } + } + } + } + } + return error; +} + +static clib_error_t * +enable_ip6_interface_cmd (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 sw_if_index; + + sw_if_index = ~0; + + if (unformat_user (input, + unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + enable_ip6_interface(vm, sw_if_index); + } + else + { + error = clib_error_return (0, "unknown interface\n'", + format_unformat_error, input); + + } + return error; +} + +VLIB_CLI_COMMAND (enable_ip6_interface_command, static) = { + .path = "enable ip6 interface", + .function = enable_ip6_interface_cmd, + .short_help = "enable ip6 interface 
<iface name>", +}; + +static clib_error_t * +disable_ip6_interface_cmd (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 sw_if_index; + + sw_if_index = ~0; + + if (unformat_user (input, + unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + error = disable_ip6_interface(vm, sw_if_index); + } + else + { + error = clib_error_return (0, "unknown interface\n'", + format_unformat_error, input); + + } + return error; +} + +VLIB_CLI_COMMAND (disable_ip6_interface_command, static) = { + .path = "disable ip6 interface", + .function = disable_ip6_interface_cmd, + .short_help = "disable ip6 interface <iface name>", +}; + +VLIB_CLI_COMMAND (ip6_nd_command, static) = { + .path = "ip6 nd", + .short_help = "Set ip6 neighbor discovery parameters", + .function = ip6_neighbor_cmd, +}; + +clib_error_t * +set_ip6_link_local_address(vlib_main_t * vm, + u32 sw_if_index, + ip6_address_t *address, + u8 address_length) +{ + clib_error_t * error = 0; + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + u32 ri; + ip6_radv_t * radv_info; + vnet_main_t * vnm = vnet_get_main(); + + if( !ip6_address_is_link_local_unicast (address)) + { + vnm->api_errno = VNET_API_ERROR_ADDRESS_NOT_LINK_LOCAL; + return(error = clib_error_return (0, "address not link-local", + format_unformat_error)); + } + + /* call enable ipv6 */ + enable_ip6_interface(vm, sw_if_index); + + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + + if(ri != ~0) + { + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + /* save if link local address (overwrite default) */ + + /* delete the old one */ + error = ip6_add_del_interface_address (vm, sw_if_index, + &radv_info->link_local_address, + radv_info->link_local_prefix_len /* address width */, + 1 /* is_del */); + + if(!error) + { + /* add the new one */ + error = ip6_add_del_interface_address (vm, sw_if_index, + address , + address_length /* address width */, + 0/* is_del 
*/); + + if(!error) + { + radv_info->link_local_address = *address; + radv_info->link_local_prefix_len = address_length; + } + } + } + else + { + vnm->api_errno = VNET_API_ERROR_IP6_NOT_ENABLED; + error = clib_error_return (0, "ip6 not enabled for interface", + format_unformat_error); + } + return error; +} + +clib_error_t * +set_ip6_link_local_address_cmd (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 sw_if_index; + ip6_address_t ip6_addr; + u32 addr_len = 0; + + if (unformat_user (input, + unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + /* get the rest of the command */ + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if(unformat (input, "%U/%d", + unformat_ip6_address, &ip6_addr, + &addr_len)) + break; + else + return(unformat_parse_error (input)); + } + } + error = set_ip6_link_local_address(vm, + sw_if_index, + &ip6_addr, + addr_len); + return error; +} + +VLIB_CLI_COMMAND (set_ip6_link_local_address_command, static) = { + .path = "set ip6 link-local address", + .short_help = "Set ip6 interface link-local address <intfc> <address.>", + .function = set_ip6_link_local_address_cmd, +}; + +/* callback when an interface address is added or deleted */ +static void +ip6_neighbor_add_del_interface_address (ip6_main_t * im, + uword opaque, + u32 sw_if_index, + ip6_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_delete) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + u32 ri; + vlib_main_t * vm = vnm->vlib_main; + ip6_radv_t * radv_info; + ip6_address_t a; + ip6_mldp_group_t *mcast_group_info; + + /* create solicited node multicast address for this interface adddress */ + ip6_set_solicited_node_multicast_address (&a, 0); + + a.as_u8[0xd] = address->as_u8[0xd]; + a.as_u8[0xe] = address->as_u8[0xe]; + a.as_u8[0xf] = address->as_u8[0xf]; + + if(!is_delete) + { + /* try to 
create radv_info - does nothing if ipv6 already enabled */ + enable_ip6_interface(vm, sw_if_index); + + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + if(ri != ~0) + { + /* get radv_info */ + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + /* add address */ + if( !ip6_address_is_link_local_unicast (address)) + radv_info->ref_count++; + + /* lookup prefix info for this address on this interface */ + uword * p = mhash_get (&radv_info->address_to_mldp_index, &a); + mcast_group_info = p ? pool_elt_at_index (radv_info->mldp_group_pool, p[0]) : 0; + + /* add -solicted node multicast address */ + if(!mcast_group_info) + { + /* add */ + u32 mi; + pool_get (radv_info->mldp_group_pool, mcast_group_info); + + mi = mcast_group_info - radv_info->mldp_group_pool; + mhash_set (&radv_info->address_to_mldp_index, &a, mi, /* old_value */ 0); + + mcast_group_info->type = 4; + mcast_group_info->mcast_source_address_pool = 0; + mcast_group_info->num_sources = 0; + memcpy(&mcast_group_info->mcast_address, &a, sizeof(ip6_address_t)); + } + } + } + else + { + + /* delete */ + /* look up the radv_t information for this interface */ + vec_validate_init_empty (nm->if_radv_pool_index_by_sw_if_index, sw_if_index, ~0); + ri = nm->if_radv_pool_index_by_sw_if_index[sw_if_index]; + if(ri != ~0) + { + /* get radv_info */ + radv_info = pool_elt_at_index (nm->if_radv_pool, ri); + + /* lookup prefix info for this address on this interface */ + uword * p = mhash_get (&radv_info->address_to_mldp_index, &a); + mcast_group_info = p ? 
pool_elt_at_index (radv_info->mldp_group_pool, p[0]) : 0; + + if(mcast_group_info) + { + mhash_unset (&radv_info->address_to_mldp_index, &a,/* old_value */ 0); + pool_put (radv_info->mldp_group_pool, mcast_group_info); + } + + /* if interface up send MLDP "report" */ + radv_info->all_routers_mcast = 0; + + /* add address */ + if( !ip6_address_is_link_local_unicast (address)) + radv_info->ref_count--; + } + } +} + +clib_error_t *ip6_set_neighbor_limit (u32 neighbor_limit) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + + nm->limit_neighbor_cache_size = neighbor_limit; + return 0; +} + +static clib_error_t * ip6_neighbor_init (vlib_main_t * vm) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip6_main_t * im = &ip6_main; + + mhash_init (&nm->neighbor_index_by_key, + /* value size */ sizeof (uword), + /* key size */ sizeof (ip6_neighbor_key_t)); + + icmp6_register_type (vm, ICMP6_neighbor_solicitation, ip6_icmp_neighbor_solicitation_node.index); + icmp6_register_type (vm, ICMP6_neighbor_advertisement, ip6_icmp_neighbor_advertisement_node.index); + icmp6_register_type (vm, ICMP6_router_solicitation, ip6_icmp_router_solicitation_node.index); + icmp6_register_type (vm, ICMP6_router_advertisement, ip6_icmp_router_advertisement_node.index); + + /* handler node for ip6 neighbor discovery events and timers */ + vlib_register_node (vm, &ip6_icmp_neighbor_discovery_event_node); + + /* add call backs */ + ip6_add_del_interface_address_callback_t cb; + memset(&cb, 0x0, sizeof(ip6_add_del_interface_address_callback_t)); + + /* when an interface address changes... 
*/ + cb.function = ip6_neighbor_add_del_interface_address; + cb.function_opaque = 0; + vec_add1 (im->add_del_interface_address_callbacks, cb); + + mhash_init (&nm->pending_resolutions_by_address, + /* value size */ sizeof (uword), + /* key size */ sizeof (ip6_address_t)); + + /* default, configurable */ + nm->limit_neighbor_cache_size = 50000; + +#if 0 + /* $$$$ Hack fix for today */ + vec_validate_init_empty + (im->discover_neighbor_next_index_by_hw_if_index, 32, 0 /* drop */); +#endif + + return 0; +} + +VLIB_INIT_FUNCTION (ip6_neighbor_init); + + +void vnet_register_ip6_neighbor_resolution_event (vnet_main_t * vnm, + void * address_arg, + uword node_index, + uword type_opaque, + uword data) +{ + ip6_neighbor_main_t * nm = &ip6_neighbor_main; + ip6_address_t * address = address_arg; + uword * p; + pending_resolution_t * pr; + + pool_get (nm->pending_resolutions, pr); + + pr->next_index = ~0; + pr->node_index = node_index; + pr->type_opaque = type_opaque; + pr->data = data; + + p = mhash_get (&nm->pending_resolutions_by_address, address); + if (p) + { + /* Insert new resolution at the head of the list */ + pr->next_index = p[0]; + mhash_unset (&nm->pending_resolutions_by_address, address, 0); + } + + mhash_set (&nm->pending_resolutions_by_address, address, + pr - nm->pending_resolutions, 0 /* old value */); +} + diff --git a/vnet/vnet/ip/ip6_packet.h b/vnet/vnet/ip/ip6_packet.h new file mode 100644 index 00000000000..9a52cf72586 --- /dev/null +++ b/vnet/vnet/ip/ip6_packet.h @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip6/packet.h: ip6 packet format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_ip6_packet_h +#define included_ip6_packet_h + +typedef union { + u8 as_u8[16]; + u16 as_u16[8]; + u32 as_u32[4]; + u64 as_u64[2]; + uword as_uword[16 / sizeof (uword)]; +} ip6_address_t; + +/* Packed so that the mhash key doesn't include uninitialized pad bytes */ +typedef CLIB_PACKED (struct { + /* IP address must be first for ip_interface_address_get_address() to work */ + ip6_address_t ip6_addr; + u32 fib_index; +}) ip6_address_fib_t; + +always_inline void +ip6_addr_fib_init (ip6_address_fib_t * addr_fib, ip6_address_t * address, + u32 fib_index) +{ + addr_fib->ip6_addr.as_u64[0] = address->as_u64[0]; + addr_fib->ip6_addr.as_u64[1] = address->as_u64[1]; + addr_fib->fib_index = fib_index; +} + +/* Special addresses: + unspecified ::/128 + loopback ::1/128 + global unicast 2000::/3 + unique local unicast fc00::/7 + link local unicast fe80::/10 + multicast ff00::/8 + ietf reserved everything else. */ + +#define foreach_ip6_multicast_address_scope \ + _ (loopback, 0x1) \ + _ (link_local, 0x2) \ + _ (admin_local, 0x4) \ + _ (site_local, 0x5) \ + _ (organization_local, 0x8) \ + _ (global, 0xe) + +#define foreach_ip6_multicast_link_local_group_id \ + _ (all_hosts, 0x1) \ + _ (all_routers, 0x2) \ + _ (rip_routers, 0x9) \ + _ (eigrp_routers, 0xa) \ + _ (pim_routers, 0xd) \ + _ (mldv2_routers, 0x16) + +typedef enum { +#define _(f,n) IP6_MULTICAST_SCOPE_##f = n, + foreach_ip6_multicast_address_scope +#undef _ +} ip6_multicast_address_scope_t; + +typedef enum { +#define _(f,n) IP6_MULTICAST_GROUP_ID_##f = n, + foreach_ip6_multicast_link_local_group_id +#undef _ +} ip6_multicast_link_local_group_id_t; + +always_inline uword +ip6_address_is_multicast (ip6_address_t * a) +{ return a->as_u8[0] == 0xff; } + +always_inline void +ip6_set_reserved_multicast_address (ip6_address_t * a, + ip6_multicast_address_scope_t scope, + u16 id) +{ + a->as_u64[0] = a->as_u64[1] = 0; + a->as_u16[0] = clib_host_to_net_u16 (0xff00 | scope); + a->as_u16[7] = 
clib_host_to_net_u16 (id); +} + +always_inline void +ip6_set_solicited_node_multicast_address (ip6_address_t * a, u32 id) +{ + /* 0xff02::1:ffXX:XXXX. */ + a->as_u64[0] = a->as_u64[1] = 0; + a->as_u16[0] = clib_host_to_net_u16 (0xff02); + a->as_u8[11] = 1; + ASSERT ((id >> 24) == 0); + id |= 0xff << 24; + a->as_u32[3] = clib_host_to_net_u32 (id); +} + +always_inline void +ip6_link_local_address_from_ethernet_address (ip6_address_t * a, u8 * ethernet_address) +{ + a->as_u64[0] = a->as_u64[1] = 0; + a->as_u16[0] = clib_host_to_net_u16 (0xfe80); + /* Always set locally administered bit (6). */ + a->as_u8[0x8] = ethernet_address[0] | (1 << 6); + a->as_u8[0x9] = ethernet_address[1]; + a->as_u8[0xa] = ethernet_address[2]; + a->as_u8[0xb] = 0xff; + a->as_u8[0xc] = 0xfe; + a->as_u8[0xd] = ethernet_address[3]; + a->as_u8[0xe] = ethernet_address[4]; + a->as_u8[0xf] = ethernet_address[5]; +} + +always_inline void +ip6_multicast_ethernet_address (u8 * ethernet_address, u32 group_id) +{ + ethernet_address[0] = 0x33; + ethernet_address[1] = 0x33; + ethernet_address[2] = ((group_id >> 24) & 0xff); + ethernet_address[3] = ((group_id >> 16) & 0xff); + ethernet_address[4] = ((group_id >> 8) & 0xff); + ethernet_address[5] = ((group_id >> 0) & 0xff); +} + +always_inline uword +ip6_address_is_equal (ip6_address_t * a, ip6_address_t * b) +{ + int i; + for (i = 0; i < ARRAY_LEN (a->as_uword); i++) + if (a->as_uword[i] != b->as_uword[i]) + return 0; + return 1; +} + +always_inline uword +ip6_address_is_equal_masked (ip6_address_t * a, ip6_address_t * b, + ip6_address_t * mask) +{ + int i; + for (i = 0; i < ARRAY_LEN (a->as_uword); i++) + { + uword a_masked, b_masked; + a_masked = a->as_uword[i] & mask->as_uword[i]; + b_masked = b->as_uword[i] & mask->as_uword[i]; + + if (a_masked != b_masked) + return 0; + } + return 1; +} + +always_inline void +ip6_address_mask (ip6_address_t * a, ip6_address_t * mask) +{ + int i; + for (i = 0; i < ARRAY_LEN (a->as_uword); i++) + a->as_uword[i] &= 
mask->as_uword[i]; +} + +always_inline void +ip6_address_set_zero (ip6_address_t * a) +{ + int i; + for (i = 0; i < ARRAY_LEN (a->as_uword); i++) + a->as_uword[i] = 0; +} + +always_inline void +ip6_address_mask_from_width (ip6_address_t * a, u32 width) +{ + int i, byte, bit, bitnum; + ASSERT (width <= 128); + memset (a, 0, sizeof (a[0])); + for (i = 0; i < width; i++) + { + bitnum = (7 - (i & 7)); + byte = i / 8; + bit = 1<<bitnum; + a->as_u8[byte] |= bit; + } +} + +always_inline uword +ip6_address_is_zero (ip6_address_t * a) +{ + int i; + for (i = 0; i < ARRAY_LEN (a->as_uword); i++) + if (a->as_uword[i] != 0) + return 0; + return 1; +} + +/* Check for unspecified address ::0 */ +always_inline uword +ip6_address_is_unspecified (ip6_address_t * a) +{ return ip6_address_is_zero (a); } + +/* Check for loopback address ::1 */ +always_inline uword +ip6_address_is_loopback (ip6_address_t * a) +{ + uword is_loopback; + u8 save = a->as_u8[15]; + a->as_u8[15] = save ^ 1; + is_loopback = ip6_address_is_zero (a); + a->as_u8[15] = save; + return is_loopback; +} + +/* Check for link local unicast fe80::/10. */ +always_inline uword +ip6_address_is_link_local_unicast (ip6_address_t * a) +{ return a->as_u8[0] == 0xfe && (a->as_u8[1] & 0xc0) == 0x80; } + +/* Check for unique local unicast fc00::/7. */ +always_inline uword +ip6_address_is_local_unicast (ip6_address_t * a) +{ return (a->as_u8[0] & 0xfe) == 0xfc; } + +/* Check for solicited node multicast 0xff02::1:ff00:0/104 */ +always_inline uword +ip6_is_solicited_node_multicast_address (ip6_address_t * a) +{ + return (a->as_u32[0] == clib_host_to_net_u32 (0xff020000) + && a->as_u32[1] == 0 + && a->as_u32[2] == clib_host_to_net_u32 (1) + && a->as_u8[12] == 0xff); +} + +typedef struct { + /* 4 bit version, 8 bit traffic class and 20 bit flow label. */ + u32 ip_version_traffic_class_and_flow_label; + + /* Total packet length not including this header (but including + any extension headers if present). 
*/ + u16 payload_length; + + /* Protocol for next header. */ + u8 protocol; + + /* Hop limit decremented by router at each hop. */ + u8 hop_limit; + + /* Source and destination address. */ + ip6_address_t src_address, dst_address; +} ip6_header_t; + +always_inline void * +ip6_next_header (ip6_header_t * i) +{ return (void *) (i + 1); } + +always_inline void +ip6_tcp_reply_x1 (ip6_header_t * ip0, tcp_header_t * tcp0) +{ + { + ip6_address_t src0, dst0; + + src0 = ip0->src_address; + dst0 = ip0->dst_address; + ip0->src_address = dst0; + ip0->dst_address = src0; + } + + { + u16 src0, dst0; + + src0 = tcp0->ports.src; + dst0 = tcp0->ports.dst; + tcp0->ports.src = dst0; + tcp0->ports.dst = src0; + } +} + +always_inline void +ip6_tcp_reply_x2 (ip6_header_t * ip0, ip6_header_t * ip1, + tcp_header_t * tcp0, tcp_header_t * tcp1) +{ + { + ip6_address_t src0, dst0, src1, dst1; + + src0 = ip0->src_address; + src1 = ip1->src_address; + dst0 = ip0->dst_address; + dst1 = ip1->dst_address; + ip0->src_address = dst0; + ip1->src_address = dst1; + ip0->dst_address = src0; + ip1->dst_address = src1; + } + + { + u16 src0, dst0, src1, dst1; + + src0 = tcp0->ports.src; + src1 = tcp1->ports.src; + dst0 = tcp0->ports.dst; + dst1 = tcp1->ports.dst; + tcp0->ports.src = dst0; + tcp1->ports.src = dst1; + tcp0->ports.dst = src0; + tcp1->ports.dst = src1; + } +} + + +typedef CLIB_PACKED (struct { + u8 data; +}) ip6_pad1_option_t; + +typedef CLIB_PACKED (struct { + u8 type; + u8 len; + u8 data[0]; +}) ip6_padN_option_t; + +typedef CLIB_PACKED (struct { +#define IP6_MLDP_ALERT_TYPE 0x5 + u8 type; + u8 len; + u16 value; +}) ip6_router_alert_option_t; + +typedef CLIB_PACKED (struct { + u8 next_hdr; + /* Length of this header plus option data in 8 byte units. 
*/ + u8 n_data_u64s; + u8 data[0]; +}) ip6_hop_by_hop_ext_t; + +typedef CLIB_PACKED (struct { + u8 next_hdr; + u8 rsv; + u16 fragment_offset_and_more; + u32 identification; +}) ip6_frag_hdr_t; + +#define ip6_frag_hdr_offset(hdr) \ + (clib_net_to_host_u16((hdr)->fragment_offset_and_more) >> 3) + +#define ip6_frag_hdr_more(hdr) \ + (clib_net_to_host_u16((hdr)->fragment_offset_and_more) & 0x1) + +#define ip6_frag_hdr_offset_and_more(offset, more) \ + clib_host_to_net_u16(((offset) << 3) + !!(more)) + +#endif /* included_ip6_packet_h */ diff --git a/vnet/vnet/ip/ip6_pg.c b/vnet/vnet/ip/ip6_pg.c new file mode 100644 index 00000000000..2c3852765d4 --- /dev/null +++ b/vnet/vnet/ip/ip6_pg.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip/ip6_pg: IP v6 packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/ip/ip.h> +#include <vnet/pg/pg.h> + +static void +ip6_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets) +{ + vlib_main_t * vm = pg->vlib_main; + u32 ip_header_offset = g->start_byte_offset; + + while (n_packets >= 2) + { + u32 pi0, pi1; + vlib_buffer_t * p0, * p1; + ip6_header_t * ip0, * ip1; + + pi0 = packets[0]; + pi1 = packets[1]; + p0 = vlib_get_buffer (vm, pi0); + p1 = vlib_get_buffer (vm, pi1); + n_packets -= 2; + packets += 2; + + ip0 = (void *) (p0->data + ip_header_offset); + ip1 = (void *) (p1->data + ip_header_offset); + + ip0->payload_length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset - sizeof (ip0[0])); + ip1->payload_length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p1) - ip_header_offset - sizeof (ip1[0])); + } + + while (n_packets >= 1) + { + u32 pi0; + vlib_buffer_t * p0; + ip6_header_t * ip0; + + pi0 = packets[0]; + p0 = vlib_get_buffer (vm, pi0); + n_packets -= 1; + packets += 1; + + ip0 = (void *) (p0->data + ip_header_offset); + + ip0->payload_length = clib_host_to_net_u16 (vlib_buffer_length_in_chain (vm, p0) - ip_header_offset - sizeof (ip0[0])); + } +} + +typedef struct { + pg_edit_t ip_version; + pg_edit_t traffic_class; + pg_edit_t flow_label; + pg_edit_t payload_length; + pg_edit_t protocol; + pg_edit_t hop_limit; + pg_edit_t src_address, dst_address; +} pg_ip6_header_t; + +static inline void +pg_ip6_header_init (pg_ip6_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, ip6_header_t, f); + _ (payload_length); + _ (hop_limit); + _ (protocol); + _ (src_address); + _ (dst_address); +#undef _ + + /* Initialize bit fields. 
*/ + pg_edit_init_bitfield (&p->ip_version, ip6_header_t, + ip_version_traffic_class_and_flow_label, + 28, 4); + pg_edit_init_bitfield (&p->traffic_class, ip6_header_t, + ip_version_traffic_class_and_flow_label, + 20, 8); + pg_edit_init_bitfield (&p->flow_label, ip6_header_t, + ip_version_traffic_class_and_flow_label, + 0, 20); +} + +uword +unformat_pg_ip6_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_ip6_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (ip6_header_t), + &group_index); + pg_ip6_header_init (p); + + /* Defaults. */ + pg_edit_set_fixed (&p->ip_version, 6); + pg_edit_set_fixed (&p->traffic_class, 0); + pg_edit_set_fixed (&p->flow_label, 0); + pg_edit_set_fixed (&p->hop_limit, 64); + + p->payload_length.type = PG_EDIT_UNSPECIFIED; + + if (! unformat (input, "%U: %U -> %U", + unformat_pg_edit, + unformat_ip_protocol, &p->protocol, + unformat_pg_edit, + unformat_ip6_address, &p->src_address, + unformat_pg_edit, + unformat_ip6_address, &p->dst_address)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "version %U", + unformat_pg_edit, + unformat_pg_number, &p->ip_version)) + ; + + else if (unformat (input, "traffic-class %U", + unformat_pg_edit, + unformat_pg_number, &p->traffic_class)) + ; + + else if (unformat (input, "length %U", + unformat_pg_edit, + unformat_pg_number, &p->payload_length)) + ; + + else if (unformat (input, "hop-limit %U", + unformat_pg_edit, + unformat_pg_number, &p->hop_limit)) + ; + + /* Can't parse input: try next protocol level. 
*/ + else + break; + } + + { + ip_main_t * im = &ip_main; + ip_protocol_t protocol; + ip_protocol_info_t * pi; + + pi = 0; + if (p->protocol.type == PG_EDIT_FIXED) + { + protocol = pg_edit_get_value (&p->protocol, PG_EDIT_LO); + pi = ip_get_protocol_info (im, protocol); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (! unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->payload_length.type == PG_EDIT_UNSPECIFIED + && s->min_packet_bytes == s->max_packet_bytes + && group_index + 1 < vec_len (s->edit_groups)) + { + pg_edit_set_fixed (&p->payload_length, + pg_edit_group_n_bytes (s, group_index) - sizeof (ip6_header_t)); + } + + p = pg_get_edit_group (s, group_index); + if (p->payload_length.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = ip6_pg_edit_function; + } + + return 1; + } + + error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + diff --git a/vnet/vnet/ip/ip_checksum.c b/vnet/vnet/ip/ip_checksum.c new file mode 100644 index 00000000000..23e7889bc7e --- /dev/null +++ b/vnet/vnet/ip/ip_checksum.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip4/ip_checksum.c: ip/tcp/udp checksums + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> + +ip_csum_t +ip_incremental_checksum (ip_csum_t sum, void * _data, uword n_bytes) +{ + uword data = pointer_to_uword (_data); + ip_csum_t sum0, sum1; + + sum0 = 0; + sum1 = sum; + + /* Align data pointer to 64 bits. 
*/ +#define _(t) \ +do { \ + if (n_bytes >= sizeof (t) \ + && sizeof (t) < sizeof (ip_csum_t) \ + && (data % (2 * sizeof (t))) != 0) \ + { \ + sum0 += * uword_to_pointer (data, t *); \ + data += sizeof (t); \ + n_bytes -= sizeof (t); \ + } \ +} while (0) + + _ (u8); + _ (u16); + if (BITS (ip_csum_t) > 32) + _ (u32); + +#undef _ + + { + ip_csum_t * d = uword_to_pointer (data, ip_csum_t *); + + while (n_bytes >= 2 * sizeof (d[0])) + { + sum0 = ip_csum_with_carry (sum0, d[0]); + sum1 = ip_csum_with_carry (sum1, d[1]); + d += 2; + n_bytes -= 2 * sizeof (d[0]); + } + + data = pointer_to_uword (d); + } + +#define _(t) \ +do { \ + if (n_bytes >= sizeof (t) && sizeof (t) <= sizeof (ip_csum_t)) \ + { \ + sum0 = ip_csum_with_carry (sum0, * uword_to_pointer (data, t *)); \ + data += sizeof (t); \ + n_bytes -= sizeof (t); \ + } \ +} while (0) + + if (BITS (ip_csum_t) > 32) + _ (u64); + _ (u32); + _ (u16); + _ (u8); + +#undef _ + + /* Combine even and odd sums. */ + sum0 = ip_csum_with_carry (sum0, sum1); + + return sum0; +} + +ip_csum_t +ip_csum_and_memcpy (ip_csum_t sum, void * dst, void * src, uword n_bytes) +{ + uword n_left, n_left_odd; + ip_csum_t * dst_even, * src_even; + ip_csum_t sum0 = sum, sum1; + + dst_even = uword_to_pointer + (pointer_to_uword (dst) &~ (sizeof (sum) - 1), + ip_csum_t *); + src_even = src; + + n_left = n_bytes; + if ((n_left_odd = dst - (void *) dst_even)) + { + u8 * d8 = dst, * s8 = src; + uword i, n_copy_odd; + + n_copy_odd = clib_min (n_left, n_left_odd); + + for (i = 0; i < n_copy_odd; i++) + d8[i] = s8[i]; + + if (n_copy_odd != n_left_odd) + return sum0; + + sum0 = ip_csum_with_carry (sum0, dst_even[0]); + dst_even += 1; + src_even = (void *) (src + n_copy_odd); + n_left -= n_left_odd; + } + + sum1 = 0; + while (n_left >= 2 * sizeof (dst_even[0])) + { + ip_csum_t dst0, dst1; + + dst0 = clib_mem_unaligned (&src_even[0], ip_csum_t); + dst1 = clib_mem_unaligned (&src_even[1], ip_csum_t); + + dst_even[0] = dst0; + dst_even[1] = dst1; + + dst_even 
+= 2; + src_even += 2; + n_left -= 2 * sizeof (dst_even[0]); + + sum0 = ip_csum_with_carry (sum0, dst0); + sum1 = ip_csum_with_carry (sum1, dst1); + } + + if (n_left >= 1 * sizeof (dst_even[0])) + { + ip_csum_t dst0; + + dst0 = clib_mem_unaligned (&src_even[0], ip_csum_t); + + dst_even[0] = dst0; + + dst_even += 1; + src_even += 1; + n_left -= 1 * sizeof (dst_even[0]); + + sum0 = ip_csum_with_carry (sum0, dst0); + } + + if (n_left > 0) + { + u8 * d8 = dst, * s8 = src; + uword i; + for (i = 0; i < n_left; i++) + d8[i] = s8[i]; + } + + return ip_csum_with_carry (sum0, sum1); +} diff --git a/vnet/vnet/ip/ip_frag.c b/vnet/vnet/ip/ip_frag.c new file mode 100644 index 00000000000..22176187a9c --- /dev/null +++ b/vnet/vnet/ip/ip_frag.c @@ -0,0 +1,449 @@ +/*--------------------------------------------------------------------------- + * Copyright (c) 2009-2014 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *--------------------------------------------------------------------------- + */ +/* + * IPv4 Fragmentation Node + * + * + */ + +#include "ip_frag.h" + +#include <vnet/ip/ip.h> + + +typedef struct { + u8 ipv6; + u16 header_offset; + u16 mtu; + u8 next; + u16 n_fragments; +} ip_frag_trace_t; + +static u8 * format_ip_frag_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip_frag_trace_t * t = va_arg (*args, ip_frag_trace_t *); + s = format(s, "IPv%s offset: %u mtu: %u fragments: %u next: %s", + t->ipv6?"6":"4", + t->header_offset, t->mtu, t->n_fragments, node->next_node_names[t->next]); + return s; +} + +static u32 running_fragment_id; + +static void +ip4_frag_do_fragment(vlib_main_t *vm, u32 pi, u32 **buffer, ip_frag_error_t *error) +{ + vlib_buffer_t *p; + ip4_header_t *ip4; + u16 mtu, ptr, len, max, rem, + offset, ip_frag_id, ip_frag_offset; + u8 *packet, more; + + vec_add1(*buffer, pi); + p = vlib_get_buffer(vm, pi); + offset = vnet_buffer(p)->ip_frag.header_offset; + mtu = vnet_buffer(p)->ip_frag.mtu; + packet = (u8 *)vlib_buffer_get_current(p); + ip4 = (ip4_header_t *)(packet + offset); + + rem = clib_net_to_host_u16(ip4->length) - sizeof(*ip4); + ptr = 0; + max = (mtu - sizeof(*ip4) - vnet_buffer(p)->ip_frag.header_offset) & ~0x7; + + if (rem < (p->current_length - offset - sizeof(*ip4))) { + *error = IP_FRAG_ERROR_MALFORMED; + return; + } + + if (mtu < sizeof(*ip4)) { + *error = IP_FRAG_ERROR_CANT_FRAGMENT_HEADER; + return; + } + + if (ip4->flags_and_fragment_offset & + clib_host_to_net_u16(IP4_HEADER_FLAG_DONT_FRAGMENT)) { + *error = IP_FRAG_ERROR_DONT_FRAGMENT_SET; + return; + } + + if (ip4_is_fragment(ip4)) { + ip_frag_id = ip4->fragment_id; + ip_frag_offset = ip4_get_fragment_offset(ip4); + more = !!(ip4->flags_and_fragment_offset & clib_host_to_net_u16(IP4_HEADER_FLAG_MORE_FRAGMENTS)); + } else { + ip_frag_id = 
(++running_fragment_id); + ip_frag_offset = 0; + more = 0; + } + + //Do the actual fragmentation + while (rem) { + u32 bi; + vlib_buffer_t *b; + ip4_header_t *fip4; + + len = (rem > (mtu - sizeof(*ip4) - vnet_buffer(p)->ip_frag.header_offset)) ? max : rem; + + if (ptr == 0) { + bi = pi; + b = p; + fip4 = (ip4_header_t *)(vlib_buffer_get_current(b) + offset); + } else { + if (!vlib_buffer_alloc(vm, &bi, 1)) { + *error = IP_FRAG_ERROR_MEMORY; + return; + } + vec_add1(*buffer, bi); + b = vlib_get_buffer(vm, bi); + vnet_buffer(b)->sw_if_index[VLIB_RX] = vnet_buffer(p)->sw_if_index[VLIB_RX]; + vnet_buffer(b)->sw_if_index[VLIB_TX] = vnet_buffer(p)->sw_if_index[VLIB_TX]; + fip4 = (ip4_header_t *)(vlib_buffer_get_current(b) + offset); + + //Copy offset and ip4 header + memcpy(b->data, packet, offset + sizeof(*ip4)); + //Copy data + memcpy(((u8*)(fip4)) + sizeof(*fip4), + packet + offset + sizeof(*fip4) + ptr, len); + } + b->current_length = offset + len + sizeof(*fip4); + + fip4->fragment_id = ip_frag_id; + fip4->flags_and_fragment_offset = clib_host_to_net_u16((ptr >> 3) + ip_frag_offset); + fip4->flags_and_fragment_offset |= clib_host_to_net_u16(((len != rem) || more) << 13); + // ((len0 != rem0) || more0) << 13 is optimization for + // ((len0 != rem0) || more0) ? 
IP4_HEADER_FLAG_MORE_FRAGMENTS : 0 + fip4->length = clib_host_to_net_u16(len + sizeof(*fip4)); + fip4->checksum = ip4_header_checksum(fip4); + + if(vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER) { + //Encapsulating ipv4 header + ip4_header_t *encap_header4 = (ip4_header_t *)vlib_buffer_get_current(b); + encap_header4->length = clib_host_to_net_u16(b->current_length); + encap_header4->checksum = ip4_header_checksum(encap_header4); + } else if (vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP6_HEADER) { + //Encapsulating ipv6 header + ip6_header_t *encap_header6 = (ip6_header_t *)vlib_buffer_get_current(b); + encap_header6->payload_length = clib_host_to_net_u16(b->current_length - sizeof(*encap_header6)); + } + + rem -= len; + ptr += len; + } +} + + +static uword +ip4_frag (vlib_main_t *vm, + vlib_node_runtime_t *node, + vlib_frame_t *frame) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + vlib_node_runtime_t * error_node = vlib_node_get_runtime(vm, ip4_frag_node.index); + from = vlib_frame_vector_args(frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + u32 frag_sent = 0, small_packets = 0; + u32 *buffer = 0; + + while (n_left_from > 0) { + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) { + u32 pi0, *frag_from, frag_left; + vlib_buffer_t *p0; + ip_frag_error_t error0; + ip4_frag_next_t next0; + + //Note: The packet is not enqueued now. + //It is instead put in a vector where other fragments + //will be put as well. 
+ pi0 = from[0]; + from += 1; + n_left_from -= 1; + error0 = IP_FRAG_ERROR_NONE; + + p0 = vlib_get_buffer(vm, pi0); + ip4_frag_do_fragment(vm, pi0, &buffer, &error0); + + if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) { + ip_frag_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof (*tr)); + tr->header_offset = vnet_buffer(p0)->ip_frag.header_offset; + tr->mtu = vnet_buffer(p0)->ip_frag.mtu; + tr->ipv6 = 0; + tr->n_fragments = vec_len(buffer); + tr->next = vnet_buffer(p0)->ip_frag.next_index; + } + + next0 = (error0 == IP_FRAG_ERROR_NONE) ? vnet_buffer(p0)->ip_frag.next_index : IP4_FRAG_NEXT_DROP; + frag_sent += vec_len(buffer); + small_packets += (vec_len(buffer) == 1); + + //Send fragments that were added in the frame + frag_from = buffer; + frag_left = vec_len(buffer); + while (frag_left > 0) { + while (frag_left > 0 && n_left_to_next > 0) { + u32 i; + i = to_next[0] = frag_from[0]; + frag_from += 1; + frag_left -= 1; + to_next += 1; + n_left_to_next -= 1; + + vlib_get_buffer(vm, i)->error = error_node->errors[error0]; + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, i, + next0); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + } + vec_reset_length(buffer); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + } + vec_free(buffer); + vlib_node_increment_counter(vm, ip4_frag_node.index, IP_FRAG_ERROR_FRAGMENT_SENT, frag_sent); + vlib_node_increment_counter(vm, ip4_frag_node.index, IP_FRAG_ERROR_SMALL_PACKET, small_packets); + + return frame->n_vectors; +} + + +static void +ip6_frag_do_fragment(vlib_main_t *vm, u32 pi, u32 **buffer, ip_frag_error_t *error) +{ + vlib_buffer_t *p; + ip6_header_t *ip6_hdr; + ip6_frag_hdr_t *frag_hdr; + u8 *payload, *next_header; + + p = vlib_get_buffer(vm, pi); + + //Parsing the IPv6 headers + ip6_hdr = vlib_buffer_get_current(p) + vnet_buffer(p)->ip_frag.header_offset; + payload = (u8 *)(ip6_hdr 
+ 1); + next_header = &ip6_hdr->protocol; + if (*next_header == IP_PROTOCOL_IP6_HOP_BY_HOP_OPTIONS) { + next_header = payload; + payload += payload[1] * 8; + } + + if (*next_header == IP_PROTOCOL_IP6_DESTINATION_OPTIONS) { + next_header = payload; + payload += payload[1] * 8; + } + + if (*next_header == IP_PROTOCOL_IPV6_ROUTE) { + next_header = payload; + payload += payload[1] * 8; + } + + u8 has_more; + u16 initial_offset; + if (*next_header == IP_PROTOCOL_IPV6_FRAGMENTATION) { + //The fragmentation header is already there + frag_hdr = (ip6_frag_hdr_t *)payload; + has_more = ip6_frag_hdr_more(frag_hdr); + initial_offset = ip6_frag_hdr_offset(frag_hdr); + } else { + //Insert a fragmentation header in the packet + u8 nh = *next_header; + *next_header = IP_PROTOCOL_IPV6_FRAGMENTATION; + vlib_buffer_advance(p, -sizeof(*frag_hdr)); + u8 *start = vlib_buffer_get_current(p); + memmove(start, start + sizeof(*frag_hdr), payload - (start + sizeof(*frag_hdr))); + frag_hdr = (ip6_frag_hdr_t *)(payload - sizeof(*frag_hdr)); + frag_hdr->identification = ++running_fragment_id; + frag_hdr->next_hdr = nh; + frag_hdr->rsv = 0; + has_more = 0; + initial_offset = 0; + } + payload = (u8 *)(frag_hdr + 1); + + u16 headers_len = payload - (u8 *)vlib_buffer_get_current(p); + u16 max_payload = vnet_buffer(p)->ip_frag.mtu - headers_len; + u16 rem = p->current_length - headers_len; + u16 ptr = 0; + + if(max_payload < 8) { + *error = IP_FRAG_ERROR_CANT_FRAGMENT_HEADER; + return; + } + + while (rem) { + u32 bi; + vlib_buffer_t *b; + u16 len = (rem > max_payload)?(max_payload & ~0x7):rem; + rem -= len; + + if (ptr != 0) { + if (!vlib_buffer_alloc(vm, &bi, 1)) { + *error = IP_FRAG_ERROR_MEMORY; + return; + } + b = vlib_get_buffer(vm, bi); + vnet_buffer(b)->sw_if_index[VLIB_RX] = vnet_buffer(p)->sw_if_index[VLIB_RX]; + vnet_buffer(b)->sw_if_index[VLIB_TX] = vnet_buffer(p)->sw_if_index[VLIB_TX]; + memcpy(vlib_buffer_get_current(b), vlib_buffer_get_current(p), headers_len); + 
memcpy(vlib_buffer_get_current(b) + headers_len, payload + ptr, len); + frag_hdr = vlib_buffer_get_current(b) + headers_len - sizeof(*frag_hdr); + } else { + bi = pi; + b = vlib_get_buffer(vm, bi); + //frag_hdr already set here + } + + ip6_hdr = vlib_buffer_get_current(b) + vnet_buffer(p)->ip_frag.header_offset; + frag_hdr->fragment_offset_and_more = ip6_frag_hdr_offset_and_more(initial_offset + (ptr >> 3), (rem || has_more)); + b->current_length = headers_len + len; + ip6_hdr->payload_length = clib_host_to_net_u16(b->current_length - vnet_buffer(p)->ip_frag.header_offset - sizeof(*ip6_hdr)); + + if(vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER) { + //Encapsulating ipv4 header + ip4_header_t *encap_header4 = (ip4_header_t *)vlib_buffer_get_current(b); + encap_header4->length = clib_host_to_net_u16(b->current_length); + encap_header4->checksum = ip4_header_checksum(encap_header4); + } else if (vnet_buffer(p)->ip_frag.flags & IP_FRAG_FLAG_IP6_HEADER) { + //Encapsulating ipv6 header + ip6_header_t *encap_header6 = (ip6_header_t *)vlib_buffer_get_current(b); + encap_header6->payload_length = clib_host_to_net_u16(b->current_length - sizeof(*encap_header6)); + } + + vec_add1(*buffer, bi); + + ptr += len; + } +} + +static uword +ip6_frag (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + u32 n_left_from, *from, next_index, *to_next, n_left_to_next; + vlib_node_runtime_t * error_node = vlib_node_get_runtime (vm, ip6_frag_node.index); + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + next_index = node->cached_next_index; + u32 frag_sent = 0, small_packets = 0; + u32 *buffer = 0; + + while (n_left_from > 0) { + vlib_get_next_frame (vm, node, next_index, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) { + u32 pi0, *frag_from, frag_left; + vlib_buffer_t * p0; + ip_frag_error_t error0; + ip6_frag_next_t next0; + + pi0 = from[0]; + from += 1; + n_left_from -= 1; + error0 = 
IP_FRAG_ERROR_NONE; + + p0 = vlib_get_buffer(vm, pi0); + ip6_frag_do_fragment(vm, pi0, &buffer, &error0); + + if (PREDICT_FALSE(p0->flags & VLIB_BUFFER_IS_TRACED)) { + ip_frag_trace_t *tr = vlib_add_trace(vm, node, p0, sizeof (*tr)); + tr->header_offset = vnet_buffer(p0)->ip_frag.header_offset; + tr->mtu = vnet_buffer(p0)->ip_frag.mtu; + tr->ipv6 = 1; + tr->n_fragments = vec_len(buffer); + tr->next = vnet_buffer(p0)->ip_frag.next_index; + } + + next0 = (error0 == IP_FRAG_ERROR_NONE) ? vnet_buffer(p0)->ip_frag.next_index : IP6_FRAG_NEXT_DROP; + frag_sent += vec_len(buffer); + small_packets += (vec_len(buffer) == 1); + + //Send fragments that were added in the frame + frag_from = buffer; + frag_left = vec_len(buffer); + while (frag_left > 0) { + while (frag_left > 0 && n_left_to_next > 0) { + u32 i; + i = to_next[0] = frag_from[0]; + frag_from += 1; + frag_left -= 1; + to_next += 1; + n_left_to_next -= 1; + + vlib_get_buffer(vm, i)->error = error_node->errors[error0]; + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, i, + next0); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + vlib_get_next_frame(vm, node, next_index, to_next, n_left_to_next); + } + vec_reset_length(buffer); + } + vlib_put_next_frame(vm, node, next_index, n_left_to_next); + } + vec_free(buffer); + vlib_node_increment_counter(vm, ip6_frag_node.index, IP_FRAG_ERROR_FRAGMENT_SENT, frag_sent); + vlib_node_increment_counter(vm, ip6_frag_node.index, IP_FRAG_ERROR_SMALL_PACKET, small_packets); + + return frame->n_vectors; +} + +static char * ip4_frag_error_strings[] = { +#define _(sym,string) string, + foreach_ip_frag_error +#undef _ +}; + +VLIB_REGISTER_NODE (ip4_frag_node) = { + .function = ip4_frag, + .name = IP4_FRAG_NODE_NAME, + .vector_size = sizeof (u32), + .format_trace = format_ip_frag_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = IP_FRAG_N_ERROR, + .error_strings = ip4_frag_error_strings, + + .n_next_nodes = IP4_FRAG_N_NEXT, + 
.next_nodes = { + [IP4_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup", + [IP4_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup", + [IP4_FRAG_NEXT_DROP] = "error-drop" + }, +}; + +VLIB_REGISTER_NODE (ip6_frag_node) = { + .function = ip6_frag, + .name = IP6_FRAG_NODE_NAME, + .vector_size = sizeof (u32), + .format_trace = format_ip_frag_trace, + .type = VLIB_NODE_TYPE_INTERNAL, + + .n_errors = IP_FRAG_N_ERROR, + .error_strings = ip4_frag_error_strings, + + .n_next_nodes = IP6_FRAG_N_NEXT, + .next_nodes = { + [IP6_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup", + [IP6_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup", + [IP6_FRAG_NEXT_DROP] = "error-drop" + }, +}; diff --git a/vnet/vnet/ip/ip_frag.h b/vnet/vnet/ip/ip_frag.h new file mode 100644 index 00000000000..04566904e5f --- /dev/null +++ b/vnet/vnet/ip/ip_frag.h @@ -0,0 +1,81 @@ +/*--------------------------------------------------------------------------- + * Copyright (c) 2009-2014 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + *--------------------------------------------------------------------------- + */ +/* + * IPv4 and IPv6 Fragmentation Nodes + * + * A packet sent to these nodes requires the following + * buffer attributes to be set: + * ip_frag.header_offset : + * Where to find the IPv4 (or IPv6) header in the packet. The preceding + * bytes are left untouched and copied into every fragment. The fragments + * are then appended. This option is used for fragmented packets + * that are encapsulated. 
+ * ip_frag.mtu : + * Maximum size of IP packets, header included, but ignoring + * the 'ip_frag.header_offset' copied bytes. + * ip_frag.next_index : + * One of ip_frag_next_t, indicating to which exit node the fragments + * should be sent to. + * + */ + +#ifndef IP_FRAG_H +#define IP_FRAG_H + +#include <vnet/vnet.h> + +#define IP_FRAG_FLAG_IP4_HEADER 0x01 //Encapsulating IPv4 header +#define IP_FRAG_FLAG_IP6_HEADER 0x02 //Encapsulating IPv6 header + +#define IP4_FRAG_NODE_NAME "ip4-frag" +#define IP6_FRAG_NODE_NAME "ip6-frag" + +vlib_node_registration_t ip4_frag_node; +vlib_node_registration_t ip6_frag_node; + +typedef enum { + IP4_FRAG_NEXT_IP4_LOOKUP, + IP4_FRAG_NEXT_IP6_LOOKUP, + IP4_FRAG_NEXT_DROP, + IP4_FRAG_N_NEXT +} ip4_frag_next_t; + +typedef enum { + IP6_FRAG_NEXT_IP4_LOOKUP, + IP6_FRAG_NEXT_IP6_LOOKUP, + IP6_FRAG_NEXT_DROP, + IP6_FRAG_N_NEXT +} ip6_frag_next_t; + +#define foreach_ip_frag_error \ + /* Must be first. */ \ + _(NONE, "packet fragmented") \ + _(SMALL_PACKET, "packet smaller than MTU") \ + _(FRAGMENT_SENT, "number of sent fragments") \ + _(CANT_FRAGMENT_HEADER, "can't fragment header'") \ + _(DONT_FRAGMENT_SET, "can't fragment this packet'") \ + _(MALFORMED, "malformed packet") \ + _(MEMORY, "could not allocate buffer") \ + _(UNKNOWN, "unknown error") + +typedef enum { +#define _(sym,str) IP_FRAG_ERROR_##sym, + foreach_ip_frag_error +#undef _ + IP_FRAG_N_ERROR, + } ip_frag_error_t; + +#endif /* ifndef IP_FRAG_H */ diff --git a/vnet/vnet/ip/ip_init.c b/vnet/vnet/ip/ip_init.c new file mode 100644 index 00000000000..0654daa7685 --- /dev/null +++ b/vnet/vnet/ip/ip_init.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip_init.c: ip generic initialization + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/ip/ip.h> + +ip_main_t ip_main; + /* + * Init function for the generic IP layer: zeroes ip_main, fills the + * protocol and port tables from protocols.def / ports.def together with + * their lookup hashes, then calls the init functions listed below in + * order, returning the first error reported (0 when all succeed). + */ +clib_error_t * +ip_main_init (vlib_main_t * vm) +{ + ip_main_t * im = &ip_main; + clib_error_t * error = 0; + + memset (im, 0, sizeof (im[0])); + /* Protocol table: one protocol_infos entry per ip_protocol() line of protocols.def */ + { + ip_protocol_info_t * pi; + u32 i; + +#define ip_protocol(n,s) \ +do { \ + vec_add2 (im->protocol_infos, pi, 1); \ + pi->protocol = n; \ + pi->name = (u8 *) #s; \ +} while (0); + +#include "protocols.def" + +#undef ip_protocol + + im->protocol_info_by_name = hash_create_string (0, sizeof (uword)); + for (i = 0; i < vec_len (im->protocol_infos); i++) + { + pi = im->protocol_infos + i; + + hash_set_mem (im->protocol_info_by_name, pi->name, i); + hash_set (im->protocol_info_by_protocol, pi->protocol, i); + } + } + /* Port table: ports.def is expanded twice, once for the names and once for the numbers */ + { + tcp_udp_port_info_t * pi; + u32 i; + static char * port_names[] = + { +#define ip_port(s,n) #s, +#include "ports.def" +#undef ip_port + }; + static u16 ports[] = + { +#define ip_port(s,n) n, +#include "ports.def" +#undef ip_port + }; + + vec_resize (im->port_infos, ARRAY_LEN (port_names)); + im->port_info_by_name = hash_create_string (0, sizeof (uword)); + + for (i = 0; i < vec_len (im->port_infos); i++) + { + pi = im->port_infos + i; + pi->port = clib_host_to_net_u16 (ports[i]); /* stored in network byte order */ + pi->name = (u8 *) port_names[i]; + hash_set_mem (im->port_info_by_name, pi->name, i); + hash_set (im->port_info_by_port, pi->port, i); + } + } + /* Run the init functions this one depends on; stop at the first error */ + if ((error = vlib_call_init_function (vm, vnet_main_init))) + return error; + + if ((error = vlib_call_init_function (vm, ip4_init))) + return error; + + if ((error = vlib_call_init_function (vm, ip6_init))) + return error; + + if ((error = vlib_call_init_function (vm, icmp4_init))) + return error; + + if ((error = vlib_call_init_function (vm, icmp6_init))) + return error; + + if ((error = vlib_call_init_function (vm, ip6_hop_by_hop_init))) + return error; + + if ((error = vlib_call_init_function (vm, ip4_hop_by_hop_init))) + return error; + +#if 0 + if ((error = vlib_call_init_function (vm, tcp_udp_lookup_init))) + return 
error; + +#endif + + if ((error = vlib_call_init_function (vm, udp_local_init))) + return error; + +#if 0 + if ((error = vlib_call_init_function (vm, tcp_init))) + return error; +#endif + + if ((error = vlib_call_init_function (vm, udp_init))) + return error; + + if ((error = vlib_call_init_function (vm, ip_classify_init))) + return error; + + if ((error = vlib_call_init_function (vm, input_acl_init))) + return error; + + return error; +} + +VLIB_INIT_FUNCTION (ip_main_init); diff --git a/vnet/vnet/ip/ip_input_acl.c b/vnet/vnet/ip/ip_input_acl.c new file mode 100644 index 00000000000..75aa9ef818f --- /dev/null +++ b/vnet/vnet/ip/ip_input_acl.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include <vnet/ip/ip.h> +#include <vnet/classify/vnet_classify.h> +#include <vnet/classify/input_acl.h> + /* Per-packet trace record written by the input ACL nodes */ +typedef struct { + u32 sw_if_index; + u32 next_index; + u32 table_index; + u32 offset; +} ip_inacl_trace_t; + +/* packet trace format function */ +static u8 * format_ip_inacl_trace (u8 * s, va_list * args) +{ + CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *); + CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *); + ip_inacl_trace_t * t = va_arg (*args, ip_inacl_trace_t *); + + s = format (s, "INACL: sw_if_index %d, next_index %d, table %d, offset %d", + t->sw_if_index, t->next_index, t->table_index, t->offset); + return s; +} + +vlib_node_registration_t ip4_inacl_node; +vlib_node_registration_t ip6_inacl_node; + +#define foreach_ip_inacl_error \ +_(MISS, "input ACL misses") \ +_(HIT, "input ACL hits") \ +_(CHAIN_HIT, "input ACL hits after chain walk") + +typedef enum { +#define _(sym,str) IP_INACL_ERROR_##sym, + foreach_ip_inacl_error +#undef _ + IP_INACL_N_ERROR, +} ip_inacl_error_t; + +static char * ip_inacl_error_strings[] = { +#define _(sym,string) string, + foreach_ip_inacl_error +#undef _ +}; + /* + * Shared body of the ip4-inacl and ip6-inacl nodes (is_ip4 selects which). + * + * Pass 1 walks the frame, hashes each packet against the classify table + * bound to its RX interface and stores the hash and table index in the + * buffer's l2_classify opaque data. Pass 2 walks the frame again, looks + * the packet up (following next_table_index on a miss) and picks the next + * node from the matching entry, from the last table's miss_next_index, or + * from the value supplied by vnet_get_config_data. Hit, miss and chain-hit + * totals are added to this node's counters. Returns frame->n_vectors. + */ +static inline uword +ip_inacl_inline (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, int is_ip4) +{ + u32 n_left_from, * from, * to_next; + acl_next_index_t next_index; + input_acl_main_t * am = &input_acl_main; + vnet_classify_main_t * vcm = am->vnet_classify_main; + f64 now = vlib_time_now (vm); + u32 hits = 0; + u32 misses = 0; + u32 chain_hits = 0; + input_acl_table_id_t tid; + vlib_node_runtime_t * error_node; + + if (is_ip4) + { + tid = INPUT_ACL_TABLE_IP4; + error_node = vlib_node_get_runtime (vm, ip4_input_node.index); + } + else + { + tid = INPUT_ACL_TABLE_IP6; + error_node = vlib_node_get_runtime (vm, ip6_input_node.index); + } + + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + /* First pass: compute hashes */ + /* Two packets per iteration; "> 2" keeps the from[2] prefetch inside the frame. NOTE(review): table_index is not compared with ~0 in this pass, unlike the second one - confirm callers guarantee a table. */ + while (n_left_from > 2) + { + vlib_buffer_t * b0, * 
b1; + u32 bi0, bi1; + u8 * h0, * h1; + u32 sw_if_index0, sw_if_index1; + u32 table_index0, table_index1; + vnet_classify_table_t * t0, * t1; + + /* prefetch next iteration */ + { + vlib_buffer_t * p1, * p2; + + p1 = vlib_get_buffer (vm, from[1]); + p2 = vlib_get_buffer (vm, from[2]); + + vlib_prefetch_buffer_header (p1, STORE); + CLIB_PREFETCH (p1->data, CLIB_CACHE_LINE_BYTES, STORE); + vlib_prefetch_buffer_header (p2, STORE); + CLIB_PREFETCH (p2->data, CLIB_CACHE_LINE_BYTES, STORE); + } + + bi0 = from[0]; + b0 = vlib_get_buffer (vm, bi0); + h0 = b0->data; + + bi1 = from[1]; + b1 = vlib_get_buffer (vm, bi1); + h1 = b1->data; + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + table_index0 = am->classify_table_index_by_sw_if_index[tid][sw_if_index0]; + + sw_if_index1 = vnet_buffer (b1)->sw_if_index[VLIB_RX]; + table_index1 = am->classify_table_index_by_sw_if_index[tid][sw_if_index1]; + + t0 = pool_elt_at_index (vcm->tables, table_index0); + + t1 = pool_elt_at_index (vcm->tables, table_index1); + + vnet_buffer(b0)->l2_classify.hash = + vnet_classify_hash_packet (t0, (u8 *) h0); + + vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash); + + vnet_buffer(b1)->l2_classify.hash = + vnet_classify_hash_packet (t1, (u8 *) h1); + + vnet_classify_prefetch_bucket (t1, vnet_buffer(b1)->l2_classify.hash); + + vnet_buffer(b0)->l2_classify.table_index = table_index0; + + vnet_buffer(b1)->l2_classify.table_index = table_index1; + + from += 2; + n_left_from -= 2; + } + /* First pass, remaining packets one at a time */ + while (n_left_from > 0) + { + vlib_buffer_t * b0; + u32 bi0; + u8 * h0; + u32 sw_if_index0; + u32 table_index0; + vnet_classify_table_t * t0; + + bi0 = from[0]; + b0 = vlib_get_buffer (vm, bi0); + h0 = b0->data; + + sw_if_index0 = vnet_buffer (b0)->sw_if_index[VLIB_RX]; + table_index0 = am->classify_table_index_by_sw_if_index[tid][sw_if_index0]; + + t0 = pool_elt_at_index (vcm->tables, table_index0); + vnet_buffer(b0)->l2_classify.hash = + vnet_classify_hash_packet (t0, (u8 *) h0); + + 
vnet_buffer(b0)->l2_classify.table_index = table_index0; + vnet_classify_prefetch_bucket (t0, vnet_buffer(b0)->l2_classify.hash); + + from++; + n_left_from--; + } + /* Second pass: rewind to the start of the frame, look up and enqueue */ + next_index = node->cached_next_index; + from = vlib_frame_vector_args (frame); + n_left_from = frame->n_vectors; + + while (n_left_from > 0) + { + u32 n_left_to_next; + + vlib_get_next_frame (vm, node, next_index, + to_next, n_left_to_next); + + /* Not enough load/store slots to dual loop... */ + while (n_left_from > 0 && n_left_to_next > 0) + { + u32 bi0; + vlib_buffer_t * b0; + u32 next0 = ACL_NEXT_INDEX_DENY; + u32 table_index0; + vnet_classify_table_t * t0; + vnet_classify_entry_t * e0; + u64 hash0; + u8 * h0; + u8 error0; + + /* Stride 3 seems to work best */ + if (PREDICT_TRUE (n_left_from > 3)) + { + vlib_buffer_t * p1 = vlib_get_buffer(vm, from[3]); + vnet_classify_table_t * tp1; + u32 table_index1; + u64 phash1; + + table_index1 = vnet_buffer(p1)->l2_classify.table_index; + + if (PREDICT_TRUE (table_index1 != ~0)) + { + tp1 = pool_elt_at_index (vcm->tables, table_index1); + phash1 = vnet_buffer(p1)->l2_classify.hash; + vnet_classify_prefetch_entry (tp1, phash1); + } + } + + /* speculatively enqueue b0 to the current next frame */ + bi0 = from[0]; + to_next[0] = bi0; + from += 1; + to_next += 1; + n_left_from -= 1; + n_left_to_next -= 1; + + b0 = vlib_get_buffer (vm, bi0); + h0 = b0->data; + table_index0 = vnet_buffer(b0)->l2_classify.table_index; + e0 = 0; + t0 = 0; + /* next0 is overwritten here; a matching entry or the table miss action may override it below */ + vnet_get_config_data (am->vnet_config_main[tid], + &vnet_buffer(b0)->ip.current_config_index, + &next0, + /* # bytes of config data */ 0); + + if (PREDICT_TRUE(table_index0 != ~0)) + { + hash0 = vnet_buffer(b0)->l2_classify.hash; + t0 = pool_elt_at_index (vcm->tables, table_index0); + + e0 = vnet_classify_find_entry (t0, (u8 *) h0, hash0, + now); + if (e0) + { + vlib_buffer_advance (b0, e0->advance); + + next0 = (e0->next_index < ACL_NEXT_INDEX_N_NEXT)? 
+ e0->next_index:next0; + + hits++; + + if (is_ip4) + error0 = (next0 == ACL_NEXT_INDEX_DENY)? + IP4_ERROR_INACL_SESSION_DENY:IP4_ERROR_NONE; + else + error0 = (next0 == ACL_NEXT_INDEX_DENY)? + IP6_ERROR_INACL_SESSION_DENY:IP6_ERROR_NONE; + b0->error = error_node->errors[error0]; + } + else + { + /* Miss in the first table: follow next_table_index until an entry matches or the chain ends */ + while (1) + { + if (PREDICT_TRUE(t0->next_table_index != ~0)) + t0 = pool_elt_at_index (vcm->tables, + t0->next_table_index); + else + { + next0 = (t0->miss_next_index < ACL_NEXT_INDEX_N_NEXT)? + t0->miss_next_index:next0; + + misses++; + + if (is_ip4) + error0 = (next0 == ACL_NEXT_INDEX_DENY)? + IP4_ERROR_INACL_TABLE_MISS:IP4_ERROR_NONE; + else + error0 = (next0 == ACL_NEXT_INDEX_DENY)? + IP6_ERROR_INACL_TABLE_MISS:IP6_ERROR_NONE; + b0->error = error_node->errors[error0]; + break; + } + + hash0 = vnet_classify_hash_packet (t0, (u8 *) h0); + e0 = vnet_classify_find_entry + (t0, (u8 *) h0, hash0, now); + if (e0) + { + vlib_buffer_advance (b0, e0->advance); + next0 = (e0->next_index < ACL_NEXT_INDEX_N_NEXT)? + e0->next_index:next0; + hits++; + chain_hits++; + + if (is_ip4) + error0 = (next0 == ACL_NEXT_INDEX_DENY)? + IP4_ERROR_INACL_SESSION_DENY:IP4_ERROR_NONE; + else + error0 = (next0 == ACL_NEXT_INDEX_DENY)? + IP6_ERROR_INACL_SESSION_DENY:IP6_ERROR_NONE; + b0->error = error_node->errors[error0]; + break; + } + } + } + } + + if (PREDICT_FALSE((node->flags & VLIB_NODE_FLAG_TRACE) + && (b0->flags & VLIB_BUFFER_IS_TRACED))) + { + ip_inacl_trace_t *t = + vlib_add_trace (vm, node, b0, sizeof (*t)); + t->sw_if_index = vnet_buffer(b0)->sw_if_index[VLIB_RX]; + t->next_index = next0; + t->table_index = t0 ? t0 - vcm->tables : ~0; + t->offset = e0 ? 
vnet_classify_get_offset (t0, e0): ~0; + } + + /* verify speculative enqueue, maybe switch current next frame */ + vlib_validate_buffer_enqueue_x1 (vm, node, next_index, + to_next, n_left_to_next, + bi0, next0); + } + + vlib_put_next_frame (vm, node, next_index, n_left_to_next); + } + + vlib_node_increment_counter (vm, node->node_index, + IP_INACL_ERROR_MISS, + misses); + vlib_node_increment_counter (vm, node->node_index, + IP_INACL_ERROR_HIT, + hits); + vlib_node_increment_counter (vm, node->node_index, + IP_INACL_ERROR_CHAIN_HIT, + chain_hits); + return frame->n_vectors; +} + /* Node function of "ip4-inacl": the shared body with is_ip4 = 1 */ +static uword +ip4_inacl (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip_inacl_inline (vm, node, frame, 1 /* is_ip4 */); +} + + +VLIB_REGISTER_NODE (ip4_inacl_node) = { + .function = ip4_inacl, + .name = "ip4-inacl", + .vector_size = sizeof (u32), + .format_trace = format_ip_inacl_trace, + .n_errors = ARRAY_LEN(ip_inacl_error_strings), + .error_strings = ip_inacl_error_strings, + + .n_next_nodes = ACL_NEXT_INDEX_N_NEXT, + .next_nodes = { + [ACL_NEXT_INDEX_DENY] = "error-drop", + }, +}; + /* Node function of "ip6-inacl": the shared body with is_ip4 = 0 */ +static uword +ip6_inacl (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ + return ip_inacl_inline (vm, node, frame, 0 /* is_ip4 */); +} + + +VLIB_REGISTER_NODE (ip6_inacl_node) = { + .function = ip6_inacl, + .name = "ip6-inacl", + .vector_size = sizeof (u32), + .format_trace = format_ip_inacl_trace, + .n_errors = ARRAY_LEN(ip_inacl_error_strings), + .error_strings = ip_inacl_error_strings, + + .n_next_nodes = ACL_NEXT_INDEX_N_NEXT, + .next_nodes = { + [ACL_NEXT_INDEX_DENY] = "error-drop", + }, +}; + /* Init hook with nothing to set up; always returns 0 */ +static clib_error_t * +ip_inacl_init (vlib_main_t * vm) +{ + return 0; +} + +VLIB_INIT_FUNCTION (ip_inacl_init); + diff --git a/vnet/vnet/ip/ip_packet.h b/vnet/vnet/ip/ip_packet.h new file mode 100644 index 00000000000..fb9a23604e1 --- /dev/null +++ b/vnet/vnet/ip/ip_packet.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015 Cisco and/or its 
affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip_packet.h: packet format common between ip4 & ip6 + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef included_ip_packet_h +#define included_ip_packet_h + +#include <vppinfra/byte_order.h> +#include <vppinfra/error.h> + /* IP protocol numbers, one enumerator per ip_protocol() line of protocols.def */ +typedef enum ip_protocol { +#define ip_protocol(n,s) IP_PROTOCOL_##s = n, +#include "protocols.def" +#undef ip_protocol +} ip_protocol_t; + +/* TCP/UDP ports. */ +typedef enum { +#define ip_port(s,n) IP_PORT_##s = n, +#include "ports.def" +#undef ip_port +} ip_port_t; + +/* Classifies protocols into TCP, UDP, ICMP or other. */ +typedef enum { + IP_BUILTIN_PROTOCOL_UDP, + IP_BUILTIN_PROTOCOL_TCP, + IP_BUILTIN_PROTOCOL_ICMP, + IP_BUILTIN_PROTOCOL_UNKNOWN, +} ip_builtin_protocol_t; + +#define foreach_ip_builtin_multicast_group \ + _ (1, all_hosts_on_subnet) \ + _ (2, all_routers_on_subnet) \ + _ (4, dvmrp) \ + _ (5, ospf_all_routers) \ + _ (6, ospf_designated_routers) \ + _ (13, pim) \ + _ (18, vrrp) \ + _ (102, hsrp) \ + _ (22, igmp_v3) + +typedef enum { +#define _(n,f) IP_MULTICAST_GROUP_##f = n, + foreach_ip_builtin_multicast_group +#undef _ +} ip_multicast_group_t; + +/* IP checksum support. */ + +/* Incremental checksum update. */ +typedef uword ip_csum_t; + /* Return sum + x with the carry out of the top bit added back in (end-around carry). */ +always_inline ip_csum_t +ip_csum_with_carry (ip_csum_t sum, ip_csum_t x) +{ + ip_csum_t t = sum + x; + return t + (t < x); +} + +/* Update checksum changing field at even byte offset from x -> 0. */ +always_inline ip_csum_t +ip_csum_add_even (ip_csum_t c, ip_csum_t x) +{ + ip_csum_t d; + + d = c - x; + + /* Fold in carry from high bit. */ + d -= d > c; + /* d is the inverse of ip_csum_with_carry: adding x back must give c again */ + ASSERT (ip_csum_with_carry (d, x) == c); + + return d; +} + +/* Update checksum changing field at even byte offset from 0 -> x. */ +always_inline ip_csum_t +ip_csum_sub_even (ip_csum_t c, ip_csum_t x) +{ return ip_csum_with_carry (c, x); } + /* + * Return sum updated for a field that changes from 'old' to 'new': + * 'old' goes through ip_csum_sub_even and 'new' through ip_csum_add_even. + * field_byte_offset and field_n_bytes describe the field inside its header. + */ +always_inline ip_csum_t +ip_csum_update_inline (ip_csum_t sum, ip_csum_t old, ip_csum_t new, + u32 field_byte_offset, u32 field_n_bytes) +{ + /* For even 1-byte fields on big-endian and odd 1-byte fields on little endian + we need to shift byte into place for checksum. 
*/ + if ((field_n_bytes % 2) + && (field_byte_offset % 2) == CLIB_ARCH_IS_LITTLE_ENDIAN) + { + old = old << 8; + new = new << 8; + } + sum = ip_csum_sub_even (sum, old); + sum = ip_csum_add_even (sum, new); + return sum; +} + /* Wrapper that derives the byte offset and size of 'field' from 'type' */ +#define ip_csum_update(sum,old,new,type,field) \ + ip_csum_update_inline ((sum), (old), (new), \ + STRUCT_OFFSET_OF (type, field), \ + STRUCT_SIZE_OF (type, field)) + /* Reduce the accumulator to 16 bits, adding the bits above bit 15 back into the low 16 bits. */ +always_inline u16 ip_csum_fold (ip_csum_t c) +{ + /* Reduce to 16 bits. */ +#if uword_bits == 64 + c = (c & (ip_csum_t) 0xffffffff) + (c >> (ip_csum_t) 32); + c = (c & 0xffff) + (c >> 16); +#endif + + c = (c & 0xffff) + (c >> 16); + c = (c & 0xffff) + (c >> 16); + + return c; +} + +/* Copy data and checksum at the same time. */ +ip_csum_t ip_csum_and_memcpy (ip_csum_t sum, void * dst, void * src, uword n_bytes); + /* + * Return sum folded to 16 bits. When dst is not aligned to + * sizeof (ip_csum_t), the first n_zero bytes at dst are zeroed and the + * aligned word containing dst is added to the sum before folding + * (n_zero is the distance from that aligned address to dst). + */ +always_inline u16 +ip_csum_and_memcpy_fold (ip_csum_t sum, void * dst) +{ + uword n_zero; + ip_csum_t * dst_even; + + dst_even = uword_to_pointer + (pointer_to_uword (dst) &~ (sizeof (sum) - 1), + ip_csum_t *); + + if ((n_zero = dst - (void *) dst_even)) + { + u8 * d8 = dst; + uword i; + + for (i = 0; i < n_zero; i++) + d8[i] = 0; + + sum = ip_csum_with_carry (sum, dst_even[0]); + } + + return ip_csum_fold (sum); +} + +/* Checksum routine. */ +ip_csum_t ip_incremental_checksum (ip_csum_t sum, void * data, uword n_bytes); + +#endif /* included_ip_packet_h */ diff --git a/vnet/vnet/ip/lookup.c b/vnet/vnet/ip/lookup.c new file mode 100644 index 00000000000..80f0a33e731 --- /dev/null +++ b/vnet/vnet/ip/lookup.c @@ -0,0 +1,2271 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip_lookup.c: ip4/6 adjacency and lookup table management + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vppinfra/math.h> /* for fabs */ +#include <vnet/ip/ip.h> + +static void +ip_multipath_del_adjacency (ip_lookup_main_t * lm, u32 del_adj_index); + /* When CLIB_DEBUG > 0, fill n_adj adjacencies with 0xfe bytes; otherwise do nothing. */ +always_inline void +ip_poison_adjacencies (ip_adjacency_t * adj, uword n_adj) +{ + if (CLIB_DEBUG > 0) + memset (adj, 0xfe, n_adj * sizeof (adj[0])); +} + +/* Create new block of given number of contiguous adjacencies. + Entry i is copied from copy_adj[i] when copy_adj is non-null; heap_handle + and n_adj are always set afterwards. The heap index of the first entry is + stored in *adj_index_return and a pointer to it is returned. */ +ip_adjacency_t * +ip_add_adjacency (ip_lookup_main_t * lm, + ip_adjacency_t * copy_adj, + u32 n_adj, + u32 * adj_index_return) +{ + ip_adjacency_t * adj; + u32 ai, i, handle; + + ai = heap_alloc (lm->adjacency_heap, n_adj, handle); + adj = heap_elt_at_index (lm->adjacency_heap, ai); + + ip_poison_adjacencies (adj, n_adj); + + /* Validate adjacency counters. */ + vlib_validate_combined_counter (&lm->adjacency_counters, ai + n_adj - 1); + + for (i = 0; i < n_adj; i++) + { + /* Make sure certain fields are always initialized. */ + adj[i].rewrite_header.sw_if_index = ~0; + adj[i].explicit_fib_index = ~0; + adj[i].mcast_group_index = ~0; + adj[i].classify_table_index = ~0; + adj[i].saved_lookup_next_index = 0; + + if (copy_adj) + adj[i] = copy_adj[i]; + + adj[i].heap_handle = handle; + adj[i].n_adj = n_adj; + + /* Zero possibly stale counters for re-used adjacencies. 
*/ + vlib_zero_combined_counter (&lm->adjacency_counters, ai + i); + } + + *adj_index_return = ai; + return adj; +} + /* Delete the adjacency block at adj_index: run the add/del callbacks with is_del = 1, call ip_multipath_del_adjacency when delete_multipath_adjacency is non-zero, poison the block and release its heap handle. */ +static void ip_del_adjacency2 (ip_lookup_main_t * lm, u32 adj_index, u32 delete_multipath_adjacency) +{ + ip_adjacency_t * adj; + uword handle; + + ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 1); + + adj = ip_get_adjacency (lm, adj_index); + handle = adj->heap_handle; + + if (delete_multipath_adjacency) + ip_multipath_del_adjacency (lm, adj_index); + + ip_poison_adjacencies (adj, adj->n_adj); + + heap_dealloc (lm->adjacency_heap, handle); +} + /* Public entry point: same as ip_del_adjacency2 with delete_multipath_adjacency = 1. */ +void ip_del_adjacency (ip_lookup_main_t * lm, u32 adj_index) +{ ip_del_adjacency2 (lm, adj_index, /* delete_multipath_adjacency */ 1); } + /* Comparison function: orders next hops by weight (result is +1 or -1); equal weights are ordered by the difference of next_hop_adj_index. */ +static int +next_hop_sort_by_weight (ip_multipath_next_hop_t * n1, + ip_multipath_next_hop_t * n2) +{ + int cmp = (int) n1->weight - (int) n2->weight; + return (cmp == 0 + ? (int) n1->next_hop_adj_index - (int) n2->next_hop_adj_index + : (cmp > 0 ? +1 : -1)); +} + +/* Given next hop vector is over-written with normalized one with sorted weights and + with weights corresponding to the number of adjacencies for each next hop. + Returns number of adjacencies in block. */ +static u32 ip_multipath_normalize_next_hops (ip_lookup_main_t * lm, + ip_multipath_next_hop_t * raw_next_hops, + ip_multipath_next_hop_t ** normalized_next_hops) +{ + ip_multipath_next_hop_t * nhs; + uword n_nhs, n_adj, n_adj_left, i; + f64 sum_weight, norm, error; + + n_nhs = vec_len (raw_next_hops); + ASSERT (n_nhs > 0); + if (n_nhs == 0) + return 0; + + /* Allocate enough space for 2 copies; we'll use second copy to save original weights. */ + nhs = *normalized_next_hops; + vec_validate (nhs, 2*n_nhs - 1); + + /* Fast path: 1 next hop in block. 
*/ + n_adj = n_nhs; + if (n_nhs == 1) + { + nhs[0] = raw_next_hops[0]; + nhs[0].weight = 1; + _vec_len (nhs) = 1; + goto done; + } + + else if (n_nhs == 2) + { + int cmp = next_hop_sort_by_weight (&raw_next_hops[0], &raw_next_hops[1]) < 0; + + /* Fast sort. */ + nhs[0] = raw_next_hops[cmp]; + nhs[1] = raw_next_hops[cmp ^ 1]; + + /* Fast path: equal cost multipath with 2 next hops. */ + if (nhs[0].weight == nhs[1].weight) + { + nhs[0].weight = nhs[1].weight = 1; + _vec_len (nhs) = 2; + goto done; + } + } + else + { + memcpy (nhs, raw_next_hops, n_nhs * sizeof (raw_next_hops[0])); + qsort (nhs, n_nhs, sizeof (nhs[0]), (void *) next_hop_sort_by_weight); + } + + /* Find total weight to normalize weights. */ + sum_weight = 0; + for (i = 0; i < n_nhs; i++) + sum_weight += nhs[i].weight; + + /* In the unlikely case that all weights are given as 0, set them all to 1. */ + if (sum_weight == 0) + { + for (i = 0; i < n_nhs; i++) + nhs[i].weight = 1; + sum_weight = n_nhs; + } + + /* Save copies of all next hop weights to avoid being overwritten in loop below. */ + for (i = 0; i < n_nhs; i++) + nhs[n_nhs + i].weight = nhs[i].weight; + + /* Try larger and larger power of 2 sized adjacency blocks until we + find one where traffic flows to within 1% of specified weights. */ + for (n_adj = max_pow2 (n_nhs); ; n_adj *= 2) + { + error = 0; + + norm = n_adj / sum_weight; + n_adj_left = n_adj; + for (i = 0; i < n_nhs; i++) + { + f64 nf = nhs[n_nhs + i].weight * norm; /* use saved weights */ + word n = flt_round_nearest (nf); + + n = n > n_adj_left ? n_adj_left : n; + n_adj_left -= n; + error += fabs (nf - n); + nhs[i].weight = n; + } + + nhs[0].weight += n_adj_left; + + /* Less than 5% average error per adjacency with this size adjacency block? */ + if (error <= lm->multipath_next_hop_error_tolerance*n_adj) + { + /* Truncate any next hops with zero weight. */ + _vec_len (nhs) = i; + break; + } + } + + done: + /* Save vector for next call. 
*/ + *normalized_next_hops = nhs; + return n_adj; +} + +always_inline uword +ip_next_hop_hash_key_from_handle (uword handle) +{ return 1 + 2*handle; } + +always_inline uword +ip_next_hop_hash_key_is_heap_handle (uword k) +{ return k & 1; } + +always_inline uword +ip_next_hop_hash_key_get_heap_handle (uword k) +{ + ASSERT (ip_next_hop_hash_key_is_heap_handle (k)); + return k / 2; +} + +static u32 +ip_multipath_adjacency_get (ip_lookup_main_t * lm, + ip_multipath_next_hop_t * raw_next_hops, + uword create_if_non_existent) +{ + uword * p; + u32 i, j, n_adj, adj_index, adj_heap_handle; + ip_adjacency_t * adj, * copy_adj; + ip_multipath_next_hop_t * nh, * nhs; + ip_multipath_adjacency_t * madj; + + n_adj = ip_multipath_normalize_next_hops (lm, raw_next_hops, &lm->next_hop_hash_lookup_key_normalized); + nhs = lm->next_hop_hash_lookup_key_normalized; + + /* Basic sanity. */ + ASSERT (n_adj >= vec_len (raw_next_hops)); + + /* Use normalized next hops to see if we've seen a block equivalent to this one before. */ + p = hash_get_mem (lm->multipath_adjacency_by_next_hops, nhs); + if (p) + return p[0]; + + if (! create_if_non_existent) + return 0; + + adj = ip_add_adjacency (lm, /* copy_adj */ 0, n_adj, &adj_index); + adj_heap_handle = adj[0].heap_handle; + + /* Fill in adjacencies in block based on corresponding next hop adjacencies. */ + i = 0; + vec_foreach (nh, nhs) + { + copy_adj = ip_get_adjacency (lm, nh->next_hop_adj_index); + for (j = 0; j < nh->weight; j++) + { + adj[i] = copy_adj[0]; + adj[i].heap_handle = adj_heap_handle; + adj[i].n_adj = n_adj; + i++; + } + } + + /* All adjacencies should have been initialized. */ + ASSERT (i == n_adj); + + vec_validate (lm->multipath_adjacencies, adj_heap_handle); + madj = vec_elt_at_index (lm->multipath_adjacencies, adj_heap_handle); + + madj->adj_index = adj_index; + madj->n_adj_in_block = n_adj; + madj->reference_count = 0; /* caller will set to one. 
*/ + + madj->normalized_next_hops.count = vec_len (nhs); + madj->normalized_next_hops.heap_offset + = heap_alloc (lm->next_hop_heap, vec_len (nhs), + madj->normalized_next_hops.heap_handle); + memcpy (lm->next_hop_heap + madj->normalized_next_hops.heap_offset, + nhs, vec_bytes (nhs)); + + hash_set (lm->multipath_adjacency_by_next_hops, + ip_next_hop_hash_key_from_handle (madj->normalized_next_hops.heap_handle), + madj - lm->multipath_adjacencies); + + madj->unnormalized_next_hops.count = vec_len (raw_next_hops); + madj->unnormalized_next_hops.heap_offset + = heap_alloc (lm->next_hop_heap, vec_len (raw_next_hops), + madj->unnormalized_next_hops.heap_handle); + memcpy (lm->next_hop_heap + madj->unnormalized_next_hops.heap_offset, + raw_next_hops, vec_bytes (raw_next_hops)); + + ip_call_add_del_adjacency_callbacks (lm, adj_index, /* is_del */ 0); + + return adj_heap_handle; +} + +/* Returns 0 for next hop not found. */ +u32 +ip_multipath_adjacency_add_del_next_hop (ip_lookup_main_t * lm, + u32 is_del, + u32 old_mp_adj_index, + u32 next_hop_adj_index, + u32 next_hop_weight, + u32 * new_mp_adj_index) +{ + ip_multipath_adjacency_t * mp_old, * mp_new; + ip_multipath_next_hop_t * nh, * nhs, * hash_nhs; + u32 n_nhs, i_nh; + + mp_new = mp_old = 0; + n_nhs = 0; + i_nh = 0; + nhs = 0; + + /* If old multipath adjacency is valid, find requested next hop. */ + if (old_mp_adj_index < vec_len (lm->multipath_adjacencies) + && lm->multipath_adjacencies[old_mp_adj_index].normalized_next_hops.count > 0) + { + mp_old = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index); + + nhs = vec_elt_at_index (lm->next_hop_heap, mp_old->unnormalized_next_hops.heap_offset); + n_nhs = mp_old->unnormalized_next_hops.count; + + /* Linear search: ok since n_next_hops is small. */ + for (i_nh = 0; i_nh < n_nhs; i_nh++) + if (nhs[i_nh].next_hop_adj_index == next_hop_adj_index) + break; + + /* Given next hop not found. 
*/ + if (i_nh >= n_nhs && is_del) + return 0; + } + + hash_nhs = lm->next_hop_hash_lookup_key; + if (hash_nhs) + _vec_len (hash_nhs) = 0; + + if (is_del) + { + if (n_nhs > 1) + { + /* Prepare lookup key for multipath with target next hop deleted. */ + if (i_nh > 0) + vec_add (hash_nhs, nhs + 0, i_nh); + if (i_nh + 1 < n_nhs) + vec_add (hash_nhs, nhs + i_nh + 1, n_nhs - (i_nh + 1)); + } + } + else /* it's an add. */ + { + /* If next hop is already there with the same weight, we have nothing to do. */ + if (i_nh < n_nhs && nhs[i_nh].weight == next_hop_weight) + { + new_mp_adj_index[0] = ~0; + goto done; + } + + /* Copy old next hops to lookup key vector. */ + if (n_nhs > 0) + vec_add (hash_nhs, nhs, n_nhs); + + if (i_nh < n_nhs) + { + /* Change weight of existing next hop. */ + nh = vec_elt_at_index (hash_nhs, i_nh); + } + else + { + /* Add a new next hop. */ + vec_add2 (hash_nhs, nh, 1); + nh->next_hop_adj_index = next_hop_adj_index; + } + + /* Set weight for added or old next hop. */ + nh->weight = next_hop_weight; + } + + if (vec_len (hash_nhs) > 0) + { + u32 tmp = ip_multipath_adjacency_get (lm, hash_nhs, + /* create_if_non_existent */ 1); + if (tmp != ~0) + mp_new = vec_elt_at_index (lm->multipath_adjacencies, tmp); + + /* Fetch again since pool may have moved. */ + if (mp_old) + mp_old = vec_elt_at_index (lm->multipath_adjacencies, old_mp_adj_index); + } + + new_mp_adj_index[0] = mp_new ? mp_new - lm->multipath_adjacencies : ~0; + + if (mp_new != mp_old) + { + if (mp_old) + { + ASSERT (mp_old->reference_count > 0); + mp_old->reference_count -= 1; + } + if (mp_new) + mp_new->reference_count += 1; + } + + if (mp_old && mp_old->reference_count == 0) + ip_multipath_adjacency_free (lm, mp_old); + + done: + /* Save key vector next call. 
*/ + lm->next_hop_hash_lookup_key = hash_nhs; + + return 1; +} + +static void +ip_multipath_del_adjacency (ip_lookup_main_t * lm, u32 del_adj_index) +{ + ip_adjacency_t * adj = ip_get_adjacency (lm, del_adj_index); + ip_multipath_adjacency_t * madj, * new_madj; + ip_multipath_next_hop_t * nhs, * hash_nhs; + u32 i, n_nhs, madj_index, new_madj_index; + + if (adj->heap_handle >= vec_len (lm->multipath_adjacencies)) + return; + + vec_validate (lm->adjacency_remap_table, vec_len (lm->adjacency_heap) - 1); + + for (madj_index = 0; madj_index < vec_len (lm->multipath_adjacencies); madj_index++) + { + madj = vec_elt_at_index (lm->multipath_adjacencies, madj_index); + if (madj->n_adj_in_block == 0) + continue; + + nhs = heap_elt_at_index (lm->next_hop_heap, madj->unnormalized_next_hops.heap_offset); + n_nhs = madj->unnormalized_next_hops.count; + for (i = 0; i < n_nhs; i++) + if (nhs[i].next_hop_adj_index == del_adj_index) + break; + + /* del_adj_index not found in unnormalized_next_hops? We're done. */ + if (i >= n_nhs) + continue; + + new_madj = 0; + if (n_nhs > 1) + { + hash_nhs = lm->next_hop_hash_lookup_key; + if (hash_nhs) + _vec_len (hash_nhs) = 0; + if (i > 0) + vec_add (hash_nhs, nhs + 0, i); + if (i + 1 < n_nhs) + vec_add (hash_nhs, nhs + i + 1, n_nhs - (i + 1)); + + new_madj_index = ip_multipath_adjacency_get (lm, hash_nhs, /* create_if_non_existent */ 1); + + lm->next_hop_hash_lookup_key = hash_nhs; + + if (new_madj_index == madj_index) + continue; + + new_madj = vec_elt_at_index (lm->multipath_adjacencies, new_madj_index); + } + + lm->adjacency_remap_table[madj->adj_index] = new_madj ? 
1 + new_madj->adj_index : ~0; + lm->n_adjacency_remaps += 1; + ip_multipath_adjacency_free (lm, madj); + } +} + +void +ip_multipath_adjacency_free (ip_lookup_main_t * lm, + ip_multipath_adjacency_t * a) +{ + hash_unset (lm->multipath_adjacency_by_next_hops, + ip_next_hop_hash_key_from_handle (a->normalized_next_hops.heap_handle)); + heap_dealloc (lm->next_hop_heap, a->normalized_next_hops.heap_handle); + heap_dealloc (lm->next_hop_heap, a->unnormalized_next_hops.heap_handle); + + ip_del_adjacency2 (lm, a->adj_index, a->reference_count == 0); + memset (a, 0, sizeof (a[0])); +} + +always_inline ip_multipath_next_hop_t * +ip_next_hop_hash_key_get_next_hops (ip_lookup_main_t * lm, uword k, + uword * n_next_hops) +{ + ip_multipath_next_hop_t * nhs; + uword n_nhs; + if (ip_next_hop_hash_key_is_heap_handle (k)) + { + uword handle = ip_next_hop_hash_key_get_heap_handle (k); + nhs = heap_elt_with_handle (lm->next_hop_heap, handle); + n_nhs = heap_len (lm->next_hop_heap, handle); + } + else + { + nhs = uword_to_pointer (k, ip_multipath_next_hop_t *); + n_nhs = vec_len (nhs); + } + *n_next_hops = n_nhs; + return nhs; +} + +static uword +ip_next_hop_hash_key_sum (hash_t * h, uword key0) +{ + ip_lookup_main_t * lm = uword_to_pointer (h->user, ip_lookup_main_t *); + ip_multipath_next_hop_t * k0; + uword n0; + + k0 = ip_next_hop_hash_key_get_next_hops (lm, key0, &n0); + return hash_memory (k0, n0 * sizeof (k0[0]), /* seed */ n0); +} + +static uword +ip_next_hop_hash_key_equal (hash_t * h, uword key0, uword key1) +{ + ip_lookup_main_t * lm = uword_to_pointer (h->user, ip_lookup_main_t *); + ip_multipath_next_hop_t * k0, * k1; + uword n0, n1; + + k0 = ip_next_hop_hash_key_get_next_hops (lm, key0, &n0); + k1 = ip_next_hop_hash_key_get_next_hops (lm, key1, &n1); + + return n0 == n1 && ! 
memcmp (k0, k1, n0 * sizeof (k0[0])); +} + +clib_error_t * +ip_interface_address_add_del (ip_lookup_main_t * lm, + u32 sw_if_index, + void * addr_fib, + u32 address_length, + u32 is_del, + u32 * result_if_address_index) +{ + vnet_main_t * vnm = vnet_get_main(); + ip_interface_address_t * a, * prev, * next; + uword * p = mhash_get (&lm->address_to_if_address_index, addr_fib); + + vec_validate_init_empty (lm->if_address_pool_index_by_sw_if_index, sw_if_index, ~0); + a = p ? pool_elt_at_index (lm->if_address_pool, p[0]) : 0; + + /* Verify given length. */ + if ((a && (address_length != a->address_length)) || (address_length == 0)) + { + vnm->api_errno = VNET_API_ERROR_ADDRESS_LENGTH_MISMATCH; + return clib_error_create + ( "%U wrong length (expected %d) for interface %U", + lm->format_address_and_length, addr_fib, + address_length, a? a->address_length : -1, + format_vnet_sw_if_index_name, vnm, sw_if_index); + } + + if (is_del) + { + if (!a) + { + vnet_sw_interface_t * si = vnet_get_sw_interface (vnm, sw_if_index); + vnm->api_errno = VNET_API_ERROR_ADDRESS_NOT_FOUND_FOR_INTERFACE; + return clib_error_create ("%U not found for interface %U", + lm->format_address_and_length, + addr_fib, address_length, + format_vnet_sw_interface_name, vnm, si); + } + + if (a->prev_this_sw_interface != ~0) + { + prev = pool_elt_at_index (lm->if_address_pool, a->prev_this_sw_interface); + prev->next_this_sw_interface = a->next_this_sw_interface; + } + if (a->next_this_sw_interface != ~0) + { + next = pool_elt_at_index (lm->if_address_pool, a->next_this_sw_interface); + next->prev_this_sw_interface = a->prev_this_sw_interface; + + if(a->prev_this_sw_interface == ~0) + lm->if_address_pool_index_by_sw_if_index[sw_if_index] = a->next_this_sw_interface; + } + + if ((a->next_this_sw_interface == ~0) && (a->prev_this_sw_interface == ~0)) + lm->if_address_pool_index_by_sw_if_index[sw_if_index] = ~0; + + mhash_unset (&lm->address_to_if_address_index, addr_fib, + /* old_value */ 0); + pool_put 
(lm->if_address_pool, a); + + if (result_if_address_index) + *result_if_address_index = ~0; + } + + else if (! a) + { + u32 pi; /* previous index */ + u32 ai; + u32 hi; /* head index */ + + pool_get (lm->if_address_pool, a); + memset (a, ~0, sizeof (a[0])); + ai = a - lm->if_address_pool; + + hi = pi = lm->if_address_pool_index_by_sw_if_index[sw_if_index]; + prev = 0; + while (pi != (u32)~0) + { + prev = pool_elt_at_index(lm->if_address_pool, pi); + pi = prev->next_this_sw_interface; + } + pi = prev ? prev - lm->if_address_pool : (u32)~0; + + a->address_key = mhash_set (&lm->address_to_if_address_index, + addr_fib, ai, /* old_value */ 0); + a->address_length = address_length; + a->sw_if_index = sw_if_index; + a->flags = 0; + a->prev_this_sw_interface = pi; + a->next_this_sw_interface = ~0; + if (prev) + prev->next_this_sw_interface = ai; + + lm->if_address_pool_index_by_sw_if_index[sw_if_index] = + (hi != ~0) ? hi : ai; + if (result_if_address_index) + *result_if_address_index = ai; + } + else + { + if (result_if_address_index) + *result_if_address_index = a - lm->if_address_pool; + } + + + return /* no error */ 0; +} + +void serialize_vec_ip_adjacency (serialize_main_t * m, va_list * va) +{ + ip_adjacency_t * a = va_arg (*va, ip_adjacency_t *); + u32 n = va_arg (*va, u32); + u32 i; + for (i = 0; i < n; i++) + { + serialize_integer (m, a[i].heap_handle, sizeof (a[i].heap_handle)); + serialize_integer (m, a[i].n_adj, sizeof (a[i].n_adj)); + serialize_integer (m, a[i].lookup_next_index, sizeof (a[i].lookup_next_index_as_int)); + switch (a[i].lookup_next_index) + { + case IP_LOOKUP_NEXT_LOCAL: + serialize_integer (m, a[i].if_address_index, sizeof (a[i].if_address_index)); + break; + + case IP_LOOKUP_NEXT_ARP: + serialize_integer (m, a[i].if_address_index, sizeof (a[i].if_address_index)); + serialize_integer (m, a[i].rewrite_header.sw_if_index, sizeof (a[i].rewrite_header.sw_if_index)); + break; + + case IP_LOOKUP_NEXT_REWRITE: + serialize (m, serialize_vnet_rewrite, 
&a[i].rewrite_header, sizeof (a[i].rewrite_data)); + break; + + default: + /* nothing else to serialize. */ + break; + } + } +} + +void unserialize_vec_ip_adjacency (serialize_main_t * m, va_list * va) +{ + ip_adjacency_t * a = va_arg (*va, ip_adjacency_t *); + u32 n = va_arg (*va, u32); + u32 i; + ip_poison_adjacencies (a, n); + for (i = 0; i < n; i++) + { + unserialize_integer (m, &a[i].heap_handle, sizeof (a[i].heap_handle)); + unserialize_integer (m, &a[i].n_adj, sizeof (a[i].n_adj)); + unserialize_integer (m, &a[i].lookup_next_index_as_int, sizeof (a[i].lookup_next_index_as_int)); + switch (a[i].lookup_next_index) + { + case IP_LOOKUP_NEXT_LOCAL: + unserialize_integer (m, &a[i].if_address_index, sizeof (a[i].if_address_index)); + break; + + case IP_LOOKUP_NEXT_ARP: + unserialize_integer (m, &a[i].if_address_index, sizeof (a[i].if_address_index)); + unserialize_integer (m, &a[i].rewrite_header.sw_if_index, sizeof (a[i].rewrite_header.sw_if_index)); + break; + + case IP_LOOKUP_NEXT_REWRITE: + unserialize (m, unserialize_vnet_rewrite, &a[i].rewrite_header, sizeof (a[i].rewrite_data)); + break; + + default: + /* nothing else to unserialize. 
*/ + break; + } + } +} + +static void serialize_vec_ip_multipath_next_hop (serialize_main_t * m, va_list * va) +{ + ip_multipath_next_hop_t * nh = va_arg (*va, ip_multipath_next_hop_t *); + u32 n = va_arg (*va, u32); + u32 i; + for (i = 0; i < n; i++) + { + serialize_integer (m, nh[i].next_hop_adj_index, sizeof (nh[i].next_hop_adj_index)); + serialize_integer (m, nh[i].weight, sizeof (nh[i].weight)); + } +} + +static void unserialize_vec_ip_multipath_next_hop (serialize_main_t * m, va_list * va) +{ + ip_multipath_next_hop_t * nh = va_arg (*va, ip_multipath_next_hop_t *); + u32 n = va_arg (*va, u32); + u32 i; + for (i = 0; i < n; i++) + { + unserialize_integer (m, &nh[i].next_hop_adj_index, sizeof (nh[i].next_hop_adj_index)); + unserialize_integer (m, &nh[i].weight, sizeof (nh[i].weight)); + } +} + +static void serialize_vec_ip_multipath_adjacency (serialize_main_t * m, va_list * va) +{ + ip_multipath_adjacency_t * a = va_arg (*va, ip_multipath_adjacency_t *); + u32 n = va_arg (*va, u32); + u32 i; + for (i = 0; i < n; i++) + { +#define foreach_ip_multipath_adjacency_field \ + _ (adj_index) _ (n_adj_in_block) _ (reference_count) \ + _ (normalized_next_hops.count) \ + _ (normalized_next_hops.heap_offset) \ + _ (normalized_next_hops.heap_handle) \ + _ (unnormalized_next_hops.count) \ + _ (unnormalized_next_hops.heap_offset) \ + _ (unnormalized_next_hops.heap_handle) + +#define _(f) serialize_integer (m, a[i].f, sizeof (a[i].f)); + foreach_ip_multipath_adjacency_field; +#undef _ + } +} + +static void unserialize_vec_ip_multipath_adjacency (serialize_main_t * m, va_list * va) +{ + ip_multipath_adjacency_t * a = va_arg (*va, ip_multipath_adjacency_t *); + u32 n = va_arg (*va, u32); + u32 i; + for (i = 0; i < n; i++) + { +#define _(f) unserialize_integer (m, &a[i].f, sizeof (a[i].f)); + foreach_ip_multipath_adjacency_field; +#undef _ + } +} + +void serialize_ip_lookup_main (serialize_main_t * m, va_list * va) +{ + ip_lookup_main_t * lm = va_arg (*va, ip_lookup_main_t *); + 
+ /* If this isn't true you need to call e.g. ip4_maybe_remap_adjacencies + to make it true. */ + ASSERT (lm->n_adjacency_remaps == 0); + + serialize (m, serialize_heap, lm->adjacency_heap, serialize_vec_ip_adjacency); + + serialize (m, serialize_heap, lm->next_hop_heap, serialize_vec_ip_multipath_next_hop); + vec_serialize (m, lm->multipath_adjacencies, serialize_vec_ip_multipath_adjacency); + + /* Adjacency counters (FIXME disabled for now). */ + if (0) + serialize (m, serialize_vlib_combined_counter_main, &lm->adjacency_counters, /* incremental */ 0); +} + +void unserialize_ip_lookup_main (serialize_main_t * m, va_list * va) +{ + ip_lookup_main_t * lm = va_arg (*va, ip_lookup_main_t *); + + unserialize (m, unserialize_heap, &lm->adjacency_heap, unserialize_vec_ip_adjacency); + unserialize (m, unserialize_heap, &lm->next_hop_heap, unserialize_vec_ip_multipath_next_hop); + vec_unserialize (m, &lm->multipath_adjacencies, unserialize_vec_ip_multipath_adjacency); + + /* Build hash table from unserialized data. */ + { + ip_multipath_adjacency_t * a; + + vec_foreach (a, lm->multipath_adjacencies) + { + if (a->n_adj_in_block > 0 && a->reference_count > 0) + hash_set (lm->multipath_adjacency_by_next_hops, + ip_next_hop_hash_key_from_handle (a->normalized_next_hops.heap_handle), + a - lm->multipath_adjacencies); + } + } + + /* Validate adjacency counters. */ + vlib_validate_combined_counter (&lm->adjacency_counters, + vec_len (lm->adjacency_heap) - 1); + + /* Adjacency counters (FIXME disabled for now). */ + if (0) + unserialize (m, unserialize_vlib_combined_counter_main, &lm->adjacency_counters, /* incremental */ 0); +} + +void ip_lookup_init (ip_lookup_main_t * lm, u32 is_ip6) +{ + ip_adjacency_t * adj; + + /* Hand-craft special miss adjacency to use when nothing matches in the + routing table. Same for drop adjacency. 
*/ + adj = ip_add_adjacency (lm, /* template */ 0, /* n-adj */ 1, &lm->miss_adj_index); + adj->lookup_next_index = IP_LOOKUP_NEXT_MISS; + ASSERT (lm->miss_adj_index == IP_LOOKUP_MISS_ADJ_INDEX); + + adj = ip_add_adjacency (lm, /* template */ 0, /* n-adj */ 1, &lm->drop_adj_index); + adj->lookup_next_index = IP_LOOKUP_NEXT_DROP; + + adj = ip_add_adjacency (lm, /* template */ 0, /* n-adj */ 1, &lm->local_adj_index); + adj->lookup_next_index = IP_LOOKUP_NEXT_LOCAL; + adj->if_address_index = ~0; + + if (! lm->fib_result_n_bytes) + lm->fib_result_n_bytes = sizeof (uword); + + lm->multipath_adjacency_by_next_hops + = hash_create2 (/* elts */ 0, + /* user */ pointer_to_uword (lm), + /* value_bytes */ sizeof (uword), + ip_next_hop_hash_key_sum, + ip_next_hop_hash_key_equal, + /* format pair/arg */ + 0, 0); + + /* 1% max error tolerance for multipath. */ + lm->multipath_next_hop_error_tolerance = .01; + + lm->is_ip6 = is_ip6; + if (is_ip6) + { + lm->format_address_and_length = format_ip6_address_and_length; + mhash_init (&lm->address_to_if_address_index, sizeof (uword), + sizeof (ip6_address_fib_t)); + } + else + { + lm->format_address_and_length = format_ip4_address_and_length; + mhash_init (&lm->address_to_if_address_index, sizeof (uword), + sizeof (ip4_address_fib_t)); + } + + { + int i; + + /* Setup all IP protocols to be punted and builtin-unknown. */ + for (i = 0; i < 256; i++) + { + lm->local_next_by_ip_protocol[i] = IP_LOCAL_NEXT_PUNT; + lm->builtin_protocol_by_ip_protocol[i] = IP_BUILTIN_PROTOCOL_UNKNOWN; + } +#if 0 + /* Eliot's TCP doesn't actually work */ + lm->local_next_by_ip_protocol[IP_PROTOCOL_TCP] = IP_LOCAL_NEXT_TCP_LOOKUP; + lm->builtin_protocol_by_ip_protocol[IP_PROTOCOL_TCP] = + IP_BUILTIN_PROTOCOL_TCP; +#endif + + lm->local_next_by_ip_protocol[IP_PROTOCOL_UDP] = IP_LOCAL_NEXT_UDP_LOOKUP; + lm->local_next_by_ip_protocol[is_ip6 ? 
IP_PROTOCOL_ICMP6 : IP_PROTOCOL_ICMP] = IP_LOCAL_NEXT_ICMP; + lm->builtin_protocol_by_ip_protocol[IP_PROTOCOL_UDP] = IP_BUILTIN_PROTOCOL_UDP; + lm->builtin_protocol_by_ip_protocol[is_ip6 ? IP_PROTOCOL_ICMP6 : IP_PROTOCOL_ICMP] = IP_BUILTIN_PROTOCOL_ICMP; + } +} + +u8 * format_ip_flow_hash_config (u8 * s, va_list * args) +{ + u32 flow_hash_config = va_arg (*args, u32); + +#define _(n,v) if (flow_hash_config & v) s = format (s, "%s ", #n); + foreach_flow_hash_bit; +#undef _ + + return s; +} + +u8 * format_ip_lookup_next (u8 * s, va_list * args) +{ + ip_lookup_next_t n = va_arg (*args, ip_lookup_next_t); + char * t = 0; + + switch (n) + { + default: + s = format (s, "unknown %d", n); + return s; + + case IP_LOOKUP_NEXT_MISS: t = "miss"; break; + case IP_LOOKUP_NEXT_DROP: t = "drop"; break; + case IP_LOOKUP_NEXT_PUNT: t = "punt"; break; + case IP_LOOKUP_NEXT_LOCAL: t = "local"; break; + case IP_LOOKUP_NEXT_ARP: t = "arp"; break; + case IP_LOOKUP_NEXT_CLASSIFY: t = "classify"; break; + case IP_LOOKUP_NEXT_MAP: t = "map"; break; + case IP_LOOKUP_NEXT_MAP_T: t = "map-t"; break; + case IP_LOOKUP_NEXT_SIXRD: t = "sixrd"; break; + case IP_LOOKUP_NEXT_REWRITE: + break; + } + + if (t) + vec_add (s, t, strlen (t)); + + return s; +} + +static u8 * format_ip_interface_address (u8 * s, va_list * args) +{ + ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *); + u32 if_address_index = va_arg (*args, u32); + ip_interface_address_t * ia = pool_elt_at_index (lm->if_address_pool, if_address_index); + void * a = ip_interface_address_get_address (lm, ia); + + if (lm->is_ip6) + return format (s, "%U", format_ip6_address_and_length, a, ia->address_length); + else + return format (s, "%U", format_ip4_address_and_length, a, ia->address_length); +} + +u8 * format_ip_adjacency (u8 * s, va_list * args) +{ + vnet_main_t * vnm = va_arg (*args, vnet_main_t *); + ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *); + u32 adj_index = va_arg (*args, u32); + ip_adjacency_t * adj = 
ip_get_adjacency (lm, adj_index); + + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_REWRITE: + s = format (s, "%U", + format_vnet_rewrite, + vnm->vlib_main, &adj->rewrite_header, sizeof (adj->rewrite_data)); + break; + + default: + s = format (s, "%U", format_ip_lookup_next, adj->lookup_next_index); + if (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP) + s = format (s, " %U", + format_vnet_sw_interface_name, + vnm, + vnet_get_sw_interface (vnm, adj->rewrite_header.sw_if_index)); + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_ARP: + case IP_LOOKUP_NEXT_LOCAL: + if (adj->if_address_index != ~0) + s = format (s, " %U", format_ip_interface_address, lm, adj->if_address_index); + break; + + case IP_LOOKUP_NEXT_CLASSIFY: + s = format (s, " table %d", adj->classify_table_index); + + default: + break; + } + break; + } + if (adj->explicit_fib_index != ~0 && adj->explicit_fib_index != 0) + s = format (s, " lookup fib index %d", adj->explicit_fib_index); + + return s; +} + +u8 * format_ip_adjacency_packet_data (u8 * s, va_list * args) +{ + vnet_main_t * vnm = va_arg (*args, vnet_main_t *); + ip_lookup_main_t * lm = va_arg (*args, ip_lookup_main_t *); + u32 adj_index = va_arg (*args, u32); + u8 * packet_data = va_arg (*args, u8 *); + u32 n_packet_data_bytes = va_arg (*args, u32); + ip_adjacency_t * adj = ip_get_adjacency (lm, adj_index); + + switch (adj->lookup_next_index) + { + case IP_LOOKUP_NEXT_REWRITE: + s = format (s, "%U", + format_vnet_rewrite_header, + vnm->vlib_main, &adj->rewrite_header, packet_data, n_packet_data_bytes); + break; + + default: + break; + } + + return s; +} + +static uword unformat_ip_lookup_next (unformat_input_t * input, va_list * args) +{ + ip_lookup_next_t * result = va_arg (*args, ip_lookup_next_t *); + ip_lookup_next_t n; + + if (unformat (input, "drop")) + n = IP_LOOKUP_NEXT_DROP; + + else if (unformat (input, "punt")) + n = IP_LOOKUP_NEXT_PUNT; + + else if (unformat (input, "local")) + n = IP_LOOKUP_NEXT_LOCAL; + + else 
if (unformat (input, "arp")) + n = IP_LOOKUP_NEXT_ARP; + + else if (unformat (input, "classify")) + n = IP_LOOKUP_NEXT_CLASSIFY; + + else + return 0; + + *result = n; + return 1; +} + +static uword unformat_ip_adjacency (unformat_input_t * input, va_list * args) +{ + vlib_main_t * vm = va_arg (*args, vlib_main_t *); + ip_adjacency_t * adj = va_arg (*args, ip_adjacency_t *); + u32 node_index = va_arg (*args, u32); + vnet_main_t * vnm = vnet_get_main(); + u32 sw_if_index, is_ip6; + ip46_address_t a46; + ip_lookup_next_t next; + + is_ip6 = node_index == ip6_rewrite_node.index; + adj->rewrite_header.node_index = node_index; + adj->explicit_fib_index = ~0; + + if (unformat (input, "arp %U %U", + unformat_vnet_sw_interface, vnm, &sw_if_index, + unformat_ip46_address, &a46, is_ip6)) + { + ip_lookup_main_t * lm = is_ip6 ? &ip6_main.lookup_main : &ip4_main.lookup_main; + ip_adjacency_t * a_adj; + u32 adj_index; + + if (is_ip6) + adj_index = ip6_fib_lookup (&ip6_main, sw_if_index, &a46.ip6); + else + adj_index = ip4_fib_lookup (&ip4_main, sw_if_index, &a46.ip4); + + a_adj = ip_get_adjacency (lm, adj_index); + + if (a_adj->rewrite_header.sw_if_index != sw_if_index) + return 0; + + if (is_ip6) + ip6_adjacency_set_interface_route (vnm, adj, sw_if_index, a_adj->if_address_index); + else + ip4_adjacency_set_interface_route (vnm, adj, sw_if_index, a_adj->if_address_index); + } + + else if (unformat_user (input, unformat_ip_lookup_next, &next)) + { + adj->lookup_next_index = next; + adj->if_address_index = ~0; + if (next == IP_LOOKUP_NEXT_LOCAL) + (void) unformat (input, "%d", &adj->if_address_index); + else if (next == IP_LOOKUP_NEXT_CLASSIFY) + if (!unformat (input, "%d", &adj->classify_table_index)) + { + clib_warning ("classify adj must specify table index"); + return 0; + } + } + + else if (unformat_user (input, + unformat_vnet_rewrite, + vm, &adj->rewrite_header, sizeof (adj->rewrite_data))) + adj->lookup_next_index = IP_LOOKUP_NEXT_REWRITE; + + else + return 0; + + return 1; 
+} + +clib_error_t * +vnet_ip_route_cmd (vlib_main_t * vm, unformat_input_t * main_input, vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * error = 0; + u32 table_id, is_del; + u32 weight, * weights = 0; + u32 * table_ids = 0; + u32 sw_if_index, * sw_if_indices = 0; + ip4_address_t ip4_addr, * ip4_dst_addresses = 0, * ip4_via_next_hops = 0; + ip6_address_t ip6_addr, * ip6_dst_addresses = 0, * ip6_via_next_hops = 0; + u32 dst_address_length, * dst_address_lengths = 0; + ip_adjacency_t parse_adj, * add_adj = 0; + unformat_input_t _line_input, * line_input = &_line_input; + f64 count; + u32 outer_table_id; + + is_del = 0; + table_id = 0; + count = 1; + + /* Get a line of input. */ + if (! unformat_user (main_input, unformat_line_input, line_input)) + return 0; + + memset(&parse_adj, 0, sizeof (parse_adj)); + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (line_input, "table %d", &table_id)) + ; + else if (unformat (line_input, "del")) + is_del = 1; + else if (unformat (line_input, "add")) + is_del = 0; + else if (unformat (line_input, "count %f", &count)) + ; + + else if (unformat (line_input, "%U/%d", + unformat_ip4_address, &ip4_addr, + &dst_address_length)) + { + vec_add1 (ip4_dst_addresses, ip4_addr); + vec_add1 (dst_address_lengths, dst_address_length); + } + + else if (unformat (line_input, "%U/%d", + unformat_ip6_address, &ip6_addr, + &dst_address_length)) + { + vec_add1 (ip6_dst_addresses, ip6_addr); + vec_add1 (dst_address_lengths, dst_address_length); + } + + else if (unformat (line_input, "via %U %U weight %u", + unformat_ip4_address, &ip4_addr, + unformat_vnet_sw_interface, vnm, &sw_if_index, + &weight)) + { + vec_add1 (ip4_via_next_hops, ip4_addr); + vec_add1 (sw_if_indices, sw_if_index); + vec_add1 (weights, weight); + vec_add1 (table_ids, (u32)~0); + } + + else if (unformat (line_input, "via %U %U weight %u", + unformat_ip6_address, &ip6_addr, + unformat_vnet_sw_interface, vnm, 
&sw_if_index, + &weight)) + { + vec_add1 (ip6_via_next_hops, ip6_addr); + vec_add1 (sw_if_indices, sw_if_index); + vec_add1 (weights, weight); + vec_add1 (table_ids, (u32)~0); + } + + else if (unformat (line_input, "via %U %U", + unformat_ip4_address, &ip4_addr, + unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + vec_add1 (ip4_via_next_hops, ip4_addr); + vec_add1 (sw_if_indices, sw_if_index); + vec_add1 (weights, 1); + vec_add1 (table_ids, (u32)~0); + } + + else if (unformat (line_input, "via %U %U", + unformat_ip6_address, &ip6_addr, + unformat_vnet_sw_interface, vnm, &sw_if_index)) + { + vec_add1 (ip6_via_next_hops, ip6_addr); + vec_add1 (sw_if_indices, sw_if_index); + vec_add1 (weights, 1); + vec_add1 (table_ids, (u32)~0); + } + else if (unformat (line_input, "via %U", + unformat_ip4_address, &ip4_addr)) + { + vec_add1 (ip4_via_next_hops, ip4_addr); + vec_add1 (sw_if_indices, (u32)~0); + vec_add1 (weights, 1); + vec_add1 (table_ids, table_id); + } + else if (unformat (line_input, "via %U", + unformat_ip6_address, &ip6_addr)) + { + vec_add1 (ip6_via_next_hops, ip6_addr); + vec_add1 (sw_if_indices, (u32)~0); + vec_add1 (weights, 1); + vec_add1 (table_ids, (u32)table_id); + } + + else if (vec_len (ip4_dst_addresses) > 0 + && unformat (line_input, "via %U", + unformat_ip_adjacency, vm, &parse_adj, ip4_rewrite_node.index)) + vec_add1 (add_adj, parse_adj); + + else if (vec_len (ip6_dst_addresses) > 0 + && unformat (line_input, "via %U", + unformat_ip_adjacency, vm, &parse_adj, ip6_rewrite_node.index)) + vec_add1 (add_adj, parse_adj); + else if (unformat (line_input, "lookup in table %d", &outer_table_id)) + { + uword * p; + + if (vec_len (ip4_dst_addresses) > 0) + p = hash_get (ip4_main.fib_index_by_table_id, outer_table_id); + else + p = hash_get (ip6_main.fib_index_by_table_id, outer_table_id); + + if (p == 0) + { + error = clib_error_return (0, "Nonexistent outer table id %d", + outer_table_id); + goto done; + } + + parse_adj.lookup_next_index = 
IP_LOOKUP_NEXT_LOCAL; + parse_adj.explicit_fib_index = p[0]; + vec_add1 (add_adj, parse_adj); + } + else + { + error = unformat_parse_error (line_input); + goto done; + } + } + + unformat_free (line_input); + + if (vec_len (ip4_dst_addresses) + vec_len (ip6_dst_addresses) == 0) + { + error = clib_error_return (0, "expected ip4/ip6 destination address/length."); + goto done; + } + + if (vec_len (ip4_dst_addresses) > 0 && vec_len (ip6_dst_addresses) > 0) + { + error = clib_error_return (0, "mixed ip4/ip6 address/length."); + goto done; + } + + if (vec_len (ip4_dst_addresses) > 0 && vec_len (ip6_via_next_hops) > 0) + { + error = clib_error_return (0, "ip4 destinations with ip6 next hops."); + goto done; + } + + if (vec_len (ip6_dst_addresses) > 0 && vec_len (ip4_via_next_hops) > 0) + { + error = clib_error_return (0, "ip6 destinations with ip4 next hops."); + goto done; + } + + if (! is_del && vec_len (add_adj) + vec_len (weights) == 0) + { + error = clib_error_return (0, "no next hops or adjacencies to add."); + goto done; + } + + if (vec_len(ip4_via_next_hops)) + { + if (sw_if_indices[0] == (u32)~0) + { + u32 ai; + uword * p; + u32 fib_index; + ip_adjacency_t *nh_adj; + + p = hash_get (ip4_main.fib_index_by_table_id, table_ids[0]); + if (p == 0) + { + error = clib_error_return (0, "Nonexistent FIB id %d", + table_ids[0]); + goto done; + } + + fib_index = p[0]; + + ai = ip4_fib_lookup_with_table (&ip4_main, + fib_index, + ip4_via_next_hops, + 1 /* disable default route */); + if (ai == 0) + { + error = clib_error_return (0, "next hop %U not in FIB", + format_ip4_address, + ip4_via_next_hops); + goto done; + } + nh_adj = ip_get_adjacency (&ip4_main.lookup_main, ai); + vec_add1 (add_adj, nh_adj[0]); + } + } + if (vec_len(ip6_via_next_hops)) + { + if (sw_if_indices[0] == (u32)~0) + { + u32 ai; + uword * p; + u32 fib_index; + ip_adjacency_t *nh_adj; + + p = hash_get (ip6_main.fib_index_by_table_id, table_ids[0]); + if (p == 0) + { + error = clib_error_return (0, 
"Nonexistent FIB id %d", + table_ids[0]); + goto done; + } + + fib_index = p[0]; + ai = ip6_fib_lookup_with_table (&ip6_main, + fib_index, + ip6_via_next_hops); + if (ai == 0) + { + error = clib_error_return (0, "next hop %U not in FIB", + format_ip6_address, + ip6_via_next_hops); + goto done; + } + nh_adj = ip_get_adjacency (&ip6_main.lookup_main, ai); + vec_add1 (add_adj, nh_adj[0]); + } + } + + { + int i; + ip4_main_t * im4 = &ip4_main; + ip6_main_t * im6 = &ip6_main; + + for (i = 0; i < vec_len (ip4_dst_addresses); i++) + { + ip4_add_del_route_args_t a; + + memset (&a, 0, sizeof (a)); + a.flags = IP4_ROUTE_FLAG_TABLE_ID; + a.table_index_or_table_id = table_id; + a.dst_address = ip4_dst_addresses[i]; + a.dst_address_length = dst_address_lengths[i]; + a.adj_index = ~0; + + if (is_del) + { + if (vec_len (ip4_via_next_hops) == 0) + { + uword * dst_hash, * dst_result; + u32 dst_address_u32; + ip4_fib_t * fib; + + fib = find_ip4_fib_by_table_index_or_id (im4, table_id, + 0 /* by table id */); + + a.flags |= IP4_ROUTE_FLAG_DEL; + dst_address_u32 = a.dst_address.as_u32 + & im4->fib_masks[a.dst_address_length]; + + dst_hash = + fib->adj_index_by_dst_address[a.dst_address_length]; + dst_result = hash_get (dst_hash, dst_address_u32); + if (dst_result) + a.adj_index = dst_result[0]; + else + { + clib_warning ("%U/%d not in FIB", + format_ip4_address, &a.dst_address, + a.dst_address_length); + continue; + } + + ip4_add_del_route (im4, &a); + ip4_maybe_remap_adjacencies (im4, table_id, + IP4_ROUTE_FLAG_TABLE_ID); + } + else + { + u32 i, j, n, f, incr; + ip4_address_t dst = a.dst_address; + f64 t[2]; + n = count; + t[0] = vlib_time_now (vm); + incr = 1<<(32 - a.dst_address_length); + for (i = 0; i < n; i++) + { + f = i + 1 < n ? 
IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP : 0; + a.dst_address = dst; + for (j = 0; j < vec_len (ip4_via_next_hops); j++) + { + if (table_ids[j] != (u32)~0) + { + uword * p = hash_get (im4->fib_index_by_table_id, + table_ids[j]); + if (p == 0) + { + clib_warning ("no such FIB table %d", + table_ids[j]); + continue; + } + table_ids[j] = p[0]; + } + + ip4_add_del_route_next_hop (im4, + IP4_ROUTE_FLAG_DEL | f, + &a.dst_address, + a.dst_address_length, + &ip4_via_next_hops[j], + sw_if_indices[j], + weights[j], (u32)~0, + table_ids[j] /* fib index */); + } + dst.as_u32 = clib_host_to_net_u32 (incr + clib_net_to_host_u32 (dst.as_u32)); + } + t[1] = vlib_time_now (vm); + if (count > 1) + vlib_cli_output (vm, "%.6e routes/sec", count / (t[1] - t[0])); + } + } + else + { + if (vec_len (add_adj) > 0) + { + a.flags |= IP4_ROUTE_FLAG_ADD; + a.add_adj = add_adj; + a.n_add_adj = vec_len (add_adj); + + ip4_add_del_route (im4, &a); + } + else if (vec_len (ip4_via_next_hops) > 0) + { + u32 i, j, n, f, incr; + ip4_address_t dst = a.dst_address; + f64 t[2]; + n = count; + t[0] = vlib_time_now (vm); + incr = 1<<(32 - a.dst_address_length); + for (i = 0; i < n; i++) + { + f = i + 1 < n ? 
IP4_ROUTE_FLAG_NOT_LAST_IN_GROUP : 0; + a.dst_address = dst; + for (j = 0; j < vec_len (ip4_via_next_hops); j++) + { + if (table_ids[j] != (u32)~0) + { + uword * p = hash_get (im4->fib_index_by_table_id, + table_ids[j]); + if (p == 0) + { + clib_warning ("no such FIB table %d", + table_ids[j]); + continue; + } + table_ids[j] = p[0]; + } + ip4_add_del_route_next_hop (im4, + IP4_ROUTE_FLAG_ADD | f, + &a.dst_address, + a.dst_address_length, + &ip4_via_next_hops[j], + sw_if_indices[j], + weights[j], (u32)~0, + table_ids[j] /* fib index */); + } + dst.as_u32 = clib_host_to_net_u32 (incr + clib_net_to_host_u32 (dst.as_u32)); + } + t[1] = vlib_time_now (vm); + if (count > 1) + vlib_cli_output (vm, "%.6e routes/sec", count / (t[1] - t[0])); + } + } + } + + for (i = 0; i < vec_len (ip6_dst_addresses); i++) + { + ip6_add_del_route_args_t a; + + + memset (&a, 0, sizeof (a)); + a.flags = IP6_ROUTE_FLAG_TABLE_ID; + a.table_index_or_table_id = table_id; + a.dst_address = ip6_dst_addresses[i]; + a.dst_address_length = dst_address_lengths[i]; + a.adj_index = ~0; + + if (is_del) + { + if (vec_len (ip6_via_next_hops) == 0) + { + BVT(clib_bihash_kv) kv, value; + ip6_address_t dst_address; + ip6_fib_t * fib; + + fib = find_ip6_fib_by_table_index_or_id (im6, table_id, + 0 /* by table id */); + + a.flags |= IP4_ROUTE_FLAG_DEL; + + dst_address = ip6_dst_addresses[i]; + + ip6_address_mask (&dst_address, + &im6->fib_masks[dst_address_length]); + + kv.key[0] = dst_address.as_u64[0]; + kv.key[1] = dst_address.as_u64[1]; + kv.key[2] = ((u64)(fib - im6->fibs)<<32) + | a.dst_address_length; + + if (BV(clib_bihash_search)(&im6->ip6_lookup_table, + &kv, &value) == 0) + a.adj_index = value.value; + else + { + clib_warning ("%U/%d not in FIB", + format_ip6_address, &a.dst_address, + a.dst_address_length); + continue; + } + + a.flags |= IP6_ROUTE_FLAG_DEL; + ip6_add_del_route (im6, &a); + ip6_maybe_remap_adjacencies (im6, table_id, + IP6_ROUTE_FLAG_TABLE_ID); + } + else + { + u32 i; + for (i = 0; i 
< vec_len (ip6_via_next_hops); i++) + { + ip6_add_del_route_next_hop (im6, + IP6_ROUTE_FLAG_DEL, + &a.dst_address, + a.dst_address_length, + &ip6_via_next_hops[i], + sw_if_indices[i], + weights[i], (u32)~0, + table_ids[i] /* fib index */); + } + } + } + else + { + if (vec_len (add_adj) > 0) + { + a.flags |= IP6_ROUTE_FLAG_ADD; + a.add_adj = add_adj; + a.n_add_adj = vec_len (add_adj); + + ip6_add_del_route (im6, &a); + } + else if (vec_len (ip6_via_next_hops) > 0) + { + u32 i; + for (i = 0; i < vec_len (ip6_via_next_hops); i++) + { + ip6_add_del_route_next_hop (im6, + IP6_ROUTE_FLAG_ADD, + &a.dst_address, + a.dst_address_length, + &ip6_via_next_hops[i], + sw_if_indices[i], + weights[i], (u32)~0, + table_ids[i]); + } + } + } + } + } + + done: + vec_free (add_adj); + vec_free (weights); + vec_free (dst_address_lengths); + vec_free (ip4_dst_addresses); + vec_free (ip6_dst_addresses); + vec_free (ip4_via_next_hops); + vec_free (ip6_via_next_hops); + return error; +} + +VLIB_CLI_COMMAND (vlib_cli_ip_command, static) = { + .path = "ip", + .short_help = "Internet protocol (IP) commands", +}; + +VLIB_CLI_COMMAND (vlib_cli_show_ip_command, static) = { + .path = "show ip", + .short_help = "Internet protocol (IP) show commands", +}; + +VLIB_CLI_COMMAND (vlib_cli_show_ip4_command, static) = { + .path = "show ip4", + .short_help = "Internet protocol version 4 (IP4) show commands", +}; + +VLIB_CLI_COMMAND (vlib_cli_show_ip6_command, static) = { + .path = "show ip6", + .short_help = "Internet protocol version 6 (IP6) show commands", +}; + +VLIB_CLI_COMMAND (ip_route_command, static) = { + .path = "ip route", + .short_help = "Add/delete IP routes", + .function = vnet_ip_route_cmd, +}; + +/* + * The next two routines address a longstanding script hemorrhoid. + * Probing a v4 or v6 neighbor needs to appear to be synchronous, + * or dependent route-adds will simply fail. 
+ */ +static clib_error_t * +ip6_probe_neighbor_wait (vlib_main_t *vm, ip6_address_t * a, u32 sw_if_index, + int retry_count) +{ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * e; + int i; + int resolved = 0; + uword event_type; + uword *event_data = 0; + + ASSERT (vlib_in_process_context(vm)); + + if (retry_count > 0) + vnet_register_ip6_neighbor_resolution_event + (vnm, a, vlib_get_current_process (vm)->node_runtime.node_index, + 1 /* event */, 0 /* data */); + + for (i = 0; i < retry_count; i++) + { + /* The interface may be down, etc. */ + e = ip6_probe_neighbor (vm, a, sw_if_index); + + if (e) + return e; + + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case 1: /* resolved... */ + vlib_cli_output (vm, "Resolved %U", + format_ip6_address, a); + resolved = 1; + goto done; + + case ~0: /* timeout */ + break; + + default: + clib_warning ("unknown event_type %d", event_type); + } + } + + done: + vec_reset_length (event_data); + + if (!resolved) + return clib_error_return (0, "Resolution failed for %U", + format_ip6_address, a); + return 0; +} + +static clib_error_t * +ip4_probe_neighbor_wait (vlib_main_t *vm, ip4_address_t * a, u32 sw_if_index, + int retry_count) +{ + vnet_main_t * vnm = vnet_get_main(); + clib_error_t * e; + int i; + int resolved = 0; + uword event_type; + uword *event_data = 0; + + ASSERT (vlib_in_process_context(vm)); + + if (retry_count > 0) + vnet_register_ip4_arp_resolution_event + (vnm, a, vlib_get_current_process (vm)->node_runtime.node_index, + 1 /* event */, 0 /* data */); + + for (i = 0; i < retry_count; i++) + { + /* The interface may be down, etc. */ + e = ip4_probe_neighbor (vm, a, sw_if_index); + + if (e) + return e; + + vlib_process_wait_for_event_or_clock (vm, 1.0); + event_type = vlib_process_get_events (vm, &event_data); + switch (event_type) + { + case 1: /* resolved... 
*/ + vlib_cli_output (vm, "Resolved %U", + format_ip4_address, a); + resolved = 1; + goto done; + + case ~0: /* timeout */ + break; + + default: + clib_warning ("unknown event_type %d", event_type); + } + } + + done: + + vec_reset_length (event_data); + + if (!resolved) + return clib_error_return (0, "Resolution failed for %U", + format_ip4_address, a); + return 0; +} + +static clib_error_t * +probe_neighbor_address (vlib_main_t * vm, + unformat_input_t * input, + vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + unformat_input_t _line_input, * line_input = &_line_input; + ip4_address_t a4; + ip6_address_t a6; + clib_error_t * error = 0; + u32 sw_if_index = ~0; + int retry_count = 3; + int is_ip4 = 1; + int address_set = 0; + + /* Get a line of input. */ + if (! unformat_user (input, unformat_line_input, line_input)) + return 0; + + while (unformat_check_input (line_input) != UNFORMAT_END_OF_INPUT) + { + if (unformat_user (line_input, unformat_vnet_sw_interface, vnm, + &sw_if_index)) + ; + else if (unformat (line_input, "retry %d", &retry_count)) + ; + + else if (unformat (line_input, "%U", unformat_ip4_address, &a4)) + address_set++; + else if (unformat (line_input, "%U", unformat_ip6_address, &a6)) + { + address_set++; + is_ip4 = 0; + } + else + return clib_error_return (0, "unknown input '%U'", + format_unformat_error, line_input); + } + + unformat_free (line_input); + + if (sw_if_index == ~0) + return clib_error_return (0, "Interface required, not set."); + if (address_set == 0) + return clib_error_return (0, "ip address required, not set."); + if (address_set > 1) + return clib_error_return (0, "Multiple ip addresses not supported."); + + if (is_ip4) + error = ip4_probe_neighbor_wait (vm, &a4, sw_if_index, retry_count); + else + error = ip6_probe_neighbor_wait (vm, &a6, sw_if_index, retry_count); + + return error; +} + +VLIB_CLI_COMMAND (ip_probe_neighbor_command, static) = { + .path = "ip probe-neighbor", + .function = 
probe_neighbor_address, + .short_help = "ip probe-neighbor <intfc> <ip4-addr> | <ip6-addr> [retry nn]", +}; + +typedef CLIB_PACKED (struct { + ip4_address_t address; + + u32 address_length : 6; + + u32 index : 26; +}) ip4_route_t; + +static clib_error_t * +ip4_show_fib (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + ip4_main_t * im4 = &ip4_main; + ip4_route_t * routes, * r; + ip4_fib_t * fib; + ip_lookup_main_t * lm = &im4->lookup_main; + uword * results, i; + int verbose, matching, mtrie, include_empty_fibs; + ip4_address_t matching_address; + u8 clear = 0; + int table_id = -1; + + routes = 0; + results = 0; + verbose = 1; + include_empty_fibs = 0; + matching = 0; + mtrie = 0; + while (unformat_check_input (input) != UNFORMAT_END_OF_INPUT) + { + if (unformat (input, "brief") || unformat (input, "summary") + || unformat (input, "sum")) + verbose = 0; + + else if (unformat (input, "mtrie")) + mtrie = 1; + + else if (unformat (input, "include-empty")) + include_empty_fibs = 1; + + else if (unformat (input, "%U", unformat_ip4_address, &matching_address)) + matching = 1; + + else if (unformat (input, "clear")) + clear = 1; + + else if (unformat (input, "table %d", &table_id)) + ; + else + break; + } + + vec_foreach (fib, im4->fibs) + { + int fib_not_empty; + + fib_not_empty = 0; + for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++) + { + uword * hash = fib->adj_index_by_dst_address[i]; + uword n_elts = hash_elts (hash); + if (n_elts) + { + fib_not_empty = 1; + break; + } + } + + if (fib_not_empty == 0 && include_empty_fibs == 0) + continue; + + if (table_id >= 0 && table_id != (int)fib->table_id) + continue; + + if (include_empty_fibs) + vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U", + fib->table_id, fib - im4->fibs, + format_ip_flow_hash_config, fib->flow_hash_config); + + /* Show summary? */ + if (! 
verbose) + { + if (include_empty_fibs == 0) + vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U", + fib->table_id, fib - im4->fibs, + format_ip_flow_hash_config, fib->flow_hash_config); + vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count"); + for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++) + { + uword * hash = fib->adj_index_by_dst_address[i]; + uword n_elts = hash_elts (hash); + if (n_elts > 0) + vlib_cli_output (vm, "%20d%16d", i, n_elts); + } + continue; + } + + if (routes) + _vec_len (routes) = 0; + if (results) + _vec_len (results) = 0; + + for (i = 0; i < ARRAY_LEN (fib->adj_index_by_dst_address); i++) + { + uword * hash = fib->adj_index_by_dst_address[i]; + hash_pair_t * p; + ip4_route_t x; + + x.address_length = i; + + if (matching) + { + x.address.as_u32 = matching_address.as_u32 & im4->fib_masks[i]; + p = hash_get_pair (hash, x.address.as_u32); + if (p) + { + if (lm->fib_result_n_words > 1) + { + x.index = vec_len (results); + vec_add (results, p->value, lm->fib_result_n_words); + } + else + x.index = p->value[0]; + vec_add1 (routes, x); + } + } + else + { + hash_foreach_pair (p, hash, ({ + x.address.data_u32 = p->key; + if (lm->fib_result_n_words > 1) + { + x.index = vec_len (results); + vec_add (results, p->value, lm->fib_result_n_words); + } + else + x.index = p->value[0]; + + vec_add1 (routes, x); + })); + } + } + + vec_sort (routes, r1, r2, + ({ int cmp = ip4_address_compare (&r1->address, &r2->address); + cmp ? 
cmp : ((int) r1->address_length - (int) r2->address_length); })); + if (vec_len(routes)) { + if (include_empty_fibs == 0) + vlib_cli_output (vm, "Table %d, fib_index %d, flow hash: %U", + fib->table_id, fib - im4->fibs, + format_ip_flow_hash_config, fib->flow_hash_config); + if (mtrie) + vlib_cli_output (vm, "%U", format_ip4_fib_mtrie, &fib->mtrie); + vlib_cli_output (vm, "%=20s%=16s%=16s%=16s", + "Destination", "Packets", "Bytes", "Adjacency"); + } + vec_foreach (r, routes) + { + vlib_counter_t c, sum; + uword i, j, n_left, n_nhs, adj_index, * result = 0; + ip_adjacency_t * adj; + ip_multipath_next_hop_t * nhs, tmp_nhs[1]; + + adj_index = r->index; + if (lm->fib_result_n_words > 1) + { + result = vec_elt_at_index (results, adj_index); + adj_index = result[0]; + } + + adj = ip_get_adjacency (lm, adj_index); + if (adj->n_adj == 1) + { + nhs = &tmp_nhs[0]; + nhs[0].next_hop_adj_index = ~0; /* not used */ + nhs[0].weight = 1; + n_nhs = 1; + } + else + { + ip_multipath_adjacency_t * madj; + madj = vec_elt_at_index (lm->multipath_adjacencies, adj->heap_handle); + nhs = heap_elt_at_index (lm->next_hop_heap, madj->normalized_next_hops.heap_offset); + n_nhs = madj->normalized_next_hops.count; + } + + n_left = nhs[0].weight; + vlib_counter_zero (&sum); + for (i = j = 0; i < adj->n_adj; i++) + { + n_left -= 1; + vlib_get_combined_counter (&lm->adjacency_counters, + adj_index + i, &c); + if (clear) + vlib_zero_combined_counter (&lm->adjacency_counters, + adj_index + i); + vlib_counter_add (&sum, &c); + if (n_left == 0) + { + u8 * msg = 0; + uword indent; + + if (j == 0) + msg = format (msg, "%-20U", + format_ip4_address_and_length, + r->address.data, r->address_length); + else + msg = format (msg, "%U", format_white_space, 20); + + msg = format (msg, "%16Ld%16Ld ", sum.packets, sum.bytes); + + indent = vec_len (msg); + msg = format (msg, "weight %d, index %d\n%U%U", + nhs[j].weight, adj_index + i, + format_white_space, indent, + format_ip_adjacency, + vnm, lm, adj_index + i); 
+ + vlib_cli_output (vm, "%v", msg); + vec_free (msg); + + if (result && lm->format_fib_result) + vlib_cli_output (vm, "%20s%U", "", + lm->format_fib_result, vm, lm, result, + i + 1 - nhs[j].weight, + nhs[j].weight); + + j++; + if (j < n_nhs) + { + n_left = nhs[j].weight; + vlib_counter_zero (&sum); + } + } + } + } + } + + vec_free (routes); + vec_free (results); + + return 0; +} + +VLIB_CLI_COMMAND (ip4_show_fib_command, static) = { + .path = "show ip fib", + .short_help = "show ip fib [mtrie] [summary] [table <n>] [<ip4-addr>] [clear] [include-empty]", + .function = ip4_show_fib, +}; + +typedef struct { + ip6_address_t address; + + u32 address_length; + + u32 index; +} ip6_route_t; + +typedef struct { + u32 fib_index; + ip6_route_t ** routep; +} add_routes_in_fib_arg_t; + +static void add_routes_in_fib (BVT(clib_bihash_kv) * kvp, void *arg) +{ + add_routes_in_fib_arg_t * ap = arg; + + if (kvp->key[2]>>32 == ap->fib_index) + { + ip6_address_t *addr; + ip6_route_t * r; + addr = (ip6_address_t *) kvp; + vec_add2 (*ap->routep, r, 1); + r->address = addr[0]; + r->address_length = kvp->key[2] & 0xFF; + r->index = kvp->value; + } +} + +typedef struct { + u32 fib_index; + u64 count_by_prefix_length[129]; +} count_routes_in_fib_at_prefix_length_arg_t; + +static void count_routes_in_fib_at_prefix_length +(BVT(clib_bihash_kv) * kvp, void *arg) +{ + count_routes_in_fib_at_prefix_length_arg_t * ap = arg; + int mask_width; + + if ((kvp->key[2]>>32) != ap->fib_index) + return; + + mask_width = kvp->key[2] & 0xFF; + + ap->count_by_prefix_length[mask_width]++; +} + + +static clib_error_t * +ip6_show_fib (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) +{ + vnet_main_t * vnm = vnet_get_main(); + ip6_main_t * im6 = &ip6_main; + ip6_route_t * routes, * r; + ip6_fib_t * fib; + ip_lookup_main_t * lm = &im6->lookup_main; + uword * results; + int verbose; + BVT(clib_bihash) * h = &im6->ip6_lookup_table; + __attribute__((unused)) u8 clear = 0; + 
add_routes_in_fib_arg_t _a, *a=&_a; + count_routes_in_fib_at_prefix_length_arg_t _ca, *ca = &_ca; + + routes = 0; + results = 0; + verbose = 1; + if (unformat (input, "brief") || unformat (input, "summary") + || unformat (input, "sum")) + verbose = 0; + + if (unformat (input, "clear")) + clear = 1; + + vlib_cli_output (vm, "FIB lookup table: %d buckets, %lld MB heap", + im6->lookup_table_nbuckets, im6->lookup_table_size>>20); + vlib_cli_output (vm, "%U", format_mheap, h->mheap, 0 /*verbose*/); + vlib_cli_output (vm, " "); + + vec_foreach (fib, im6->fibs) + { + vlib_cli_output (vm, "VRF %d, fib_index %d, flow hash: %U", + fib->table_id, fib - im6->fibs, + format_ip_flow_hash_config, fib->flow_hash_config); + + /* Show summary? */ + if (! verbose) + { + int len; + vlib_cli_output (vm, "%=20s%=16s", "Prefix length", "Count"); + + memset (ca, 0, sizeof(*ca)); + ca->fib_index = fib - im6->fibs; + + BV(clib_bihash_foreach_key_value_pair) + (h, count_routes_in_fib_at_prefix_length, ca); + + for (len = 128; len >= 0; len--) + { + if (ca->count_by_prefix_length[len]) + vlib_cli_output (vm, "%=20d%=16lld", + len, ca->count_by_prefix_length[len]); + } + continue; + } + + if (routes) + _vec_len (routes) = 0; + if (results) + _vec_len (results) = 0; + + a->fib_index = fib - im6->fibs; + a->routep = &routes; + + BV(clib_bihash_foreach_key_value_pair)(h, add_routes_in_fib, a); + + vec_sort (routes, r1, r2, + ({ int cmp = ip6_address_compare (&r1->address, &r2->address); + cmp ? 
cmp : ((int) r1->address_length - (int) r2->address_length); })); + + vlib_cli_output (vm, "%=45s%=16s%=16s%=16s", + "Destination", "Packets", "Bytes", "Adjacency"); + vec_foreach (r, routes) + { + vlib_counter_t c, sum; + uword i, j, n_left, n_nhs, adj_index, * result = 0; + ip_adjacency_t * adj; + ip_multipath_next_hop_t * nhs, tmp_nhs[1]; + + adj_index = r->index; + if (lm->fib_result_n_words > 1) + { + result = vec_elt_at_index (results, adj_index); + adj_index = result[0]; + } + + adj = ip_get_adjacency (lm, adj_index); + if (adj->n_adj == 1) + { + nhs = &tmp_nhs[0]; + nhs[0].next_hop_adj_index = ~0; /* not used */ + nhs[0].weight = 1; + n_nhs = 1; + } + else + { + ip_multipath_adjacency_t * madj; + madj = vec_elt_at_index (lm->multipath_adjacencies, adj->heap_handle); + nhs = heap_elt_at_index (lm->next_hop_heap, madj->normalized_next_hops.heap_offset); + n_nhs = madj->normalized_next_hops.count; + } + + n_left = nhs[0].weight; + vlib_counter_zero (&sum); + for (i = j = 0; i < adj->n_adj; i++) + { + n_left -= 1; + vlib_get_combined_counter (&lm->adjacency_counters, + adj_index + i, &c); + if (clear) + vlib_zero_combined_counter (&lm->adjacency_counters, + adj_index + i); + vlib_counter_add (&sum, &c); + if (n_left == 0) + { + u8 * msg = 0; + uword indent; + + if (j == 0) + msg = format (msg, "%-45U", + format_ip6_address_and_length, + r->address.as_u8, r->address_length); + else + msg = format (msg, "%U", format_white_space, 20); + + msg = format (msg, "%16Ld%16Ld ", sum.packets, sum.bytes); + + indent = vec_len (msg); + msg = format (msg, "weight %d, index %d\n%U%U", + nhs[j].weight, adj_index + i, + format_white_space, indent, + format_ip_adjacency, + vnm, lm, adj_index + i); + + vlib_cli_output (vm, "%v", msg); + vec_free (msg); + + j++; + if (j < n_nhs) + { + n_left = nhs[j].weight; + vlib_counter_zero (&sum); + } + } + } + + if (result && lm->format_fib_result) + vlib_cli_output (vm, "%20s%U", "", lm->format_fib_result, vm, lm, result, 0); + } + 
vlib_cli_output (vm, " "); + } + + vec_free (routes); + vec_free (results); + + return 0; +} + +VLIB_CLI_COMMAND (ip6_show_fib_command, static) = { + .path = "show ip6 fib", + .short_help = "show ip6 fib [summary] [clear]", + .function = ip6_show_fib, +}; diff --git a/vnet/vnet/ip/lookup.h b/vnet/vnet/ip/lookup.h new file mode 100644 index 00000000000..e4e5acfece3 --- /dev/null +++ b/vnet/vnet/ip/lookup.h @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/ip_lookup.h: ip (4 or 6) lookup structures, adjacencies, ... + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_ip_lookup_h +#define included_ip_lookup_h + +#include <vnet/vnet.h> +#include <vlib/buffer.h> + +/* Next index stored in adjacency. */ +typedef enum { + /* Packet does not match any route in table. */ + IP_LOOKUP_NEXT_MISS, + + /* Adjacency says to drop or punt this packet. */ + IP_LOOKUP_NEXT_DROP, + IP_LOOKUP_NEXT_PUNT, + + /* This packet is for one of our own IP addresses. */ + IP_LOOKUP_NEXT_LOCAL, + + /* This packet matches an "interface route" and packets + need to be passed to ARP to find rewrite string for + this destination. */ + IP_LOOKUP_NEXT_ARP, + + /* This packet is to be rewritten and forwarded to the next + processing node. This is typically the output interface but + might be another node for further output processing. 
*/ + IP_LOOKUP_NEXT_REWRITE, + + /* This packet needs to be classified */ + IP_LOOKUP_NEXT_CLASSIFY, + + /* This packet needs to go to MAP - RFC7596, RFC7597 */ + IP_LOOKUP_NEXT_MAP, + + /* This packet needs to go to MAP with Translation - RFC7599 */ + IP_LOOKUP_NEXT_MAP_T, + + /* This packets needs to go to 6RD (RFC5969) */ + IP_LOOKUP_NEXT_SIXRD, + + /* Hop-by-hop header handling */ + IP_LOOKUP_NEXT_HOP_BY_HOP, + IP_LOOKUP_NEXT_ADD_HOP_BY_HOP, + IP_LOOKUP_NEXT_POP_HOP_BY_HOP, + + IP_LOOKUP_N_NEXT, +} ip_lookup_next_t; + +/* Flow hash configuration */ +#define IP_FLOW_HASH_SRC_ADDR (1<<0) +#define IP_FLOW_HASH_DST_ADDR (1<<1) +#define IP_FLOW_HASH_PROTO (1<<2) +#define IP_FLOW_HASH_SRC_PORT (1<<3) +#define IP_FLOW_HASH_DST_PORT (1<<4) +#define IP_FLOW_HASH_REVERSE_SRC_DST (1<<5) + +/* Default: 5-tuple without the "reverse" bit */ +#define IP_FLOW_HASH_DEFAULT (0x1F) + +#define foreach_flow_hash_bit \ +_(src, IP_FLOW_HASH_SRC_ADDR) \ +_(dst, IP_FLOW_HASH_DST_ADDR) \ +_(sport, IP_FLOW_HASH_SRC_PORT) \ +_(dport, IP_FLOW_HASH_DST_PORT) \ +_(proto, IP_FLOW_HASH_PROTO) \ +_(reverse, IP_FLOW_HASH_REVERSE_SRC_DST) + +/* IP unicast adjacency. */ +typedef struct { + /* Handle for this adjacency in adjacency heap. */ + u32 heap_handle; + + /* Interface address index for this local/arp adjacency. */ + u32 if_address_index; + + /* Number of adjecencies in block. Greater than 1 means multipath; + otherwise equal to 1. */ + u16 n_adj; + + /* Next hop after ip4-lookup. */ + union { + ip_lookup_next_t lookup_next_index : 16; + u16 lookup_next_index_as_int; + }; + + /* Force re-lookup in a different FIB. ~0 => normal behavior */ + i16 explicit_fib_index; + u16 mcast_group_index; + + /* When classifying, start here */ + u16 classify_table_index; + /* Highest possible perf subgraph arc interposition, e.g. for ip6 ioam */ + u16 saved_lookup_next_index; + + vnet_declare_rewrite (VLIB_BUFFER_PRE_DATA_SIZE - 5*sizeof(u32)); +} ip_adjacency_t; + +/* Index into adjacency table. 
*/ +typedef u32 ip_adjacency_index_t; + +typedef struct { + /* Directly connected next-hop adjacency index. */ + u32 next_hop_adj_index; + + /* Path weight for this adjacency. */ + u32 weight; +} ip_multipath_next_hop_t; + +typedef struct { + /* Adjacency index of first index in block. */ + u32 adj_index; + + /* Power of 2 size of adjacency block. */ + u32 n_adj_in_block; + + /* Number of prefixes that point to this adjacency. */ + u32 reference_count; + + /* Normalized next hops are used as hash keys: they are sorted by weight + and weights are chosen so they add up to 1 << log2_n_adj_in_block (with + zero-weighted next hops being deleted). + Unnormalized next hops are saved so that control plane has a record of exactly + what the RIB told it. */ + struct { + /* Number of hops in the multipath. */ + u32 count; + + /* Offset into next hop heap for this block. */ + u32 heap_offset; + + /* Heap handle used to for example free block when we're done with it. */ + u32 heap_handle; + } normalized_next_hops, unnormalized_next_hops; +} ip_multipath_adjacency_t; + +/* IP multicast adjacency. */ +typedef struct { + /* Handle for this adjacency in adjacency heap. */ + u32 heap_handle; + + /* Number of adjecencies in block. */ + u32 n_adj; + + /* Rewrite string. */ + vnet_declare_rewrite (64 - 2*sizeof(u32)); +} ip_multicast_rewrite_t; + +typedef struct { + /* ip4-multicast-rewrite next index. */ + u32 next_index; + + u8 n_rewrite_bytes; + + u8 rewrite_string[64 - 1*sizeof(u32) - 1*sizeof(u8)]; +} ip_multicast_rewrite_string_t; + +typedef struct { + ip_multicast_rewrite_t * rewrite_heap; + + ip_multicast_rewrite_string_t * rewrite_strings; + + /* Negative rewrite string index; >= 0 sw_if_index. + Sorted. Used to hash. */ + i32 ** adjacency_id_vector; + + uword * adjacency_by_id_vector; +} ip_multicast_lookup_main_t; + +typedef struct { + /* Key for mhash; in fact, just a byte offset into mhash key vector. */ + u32 address_key; + + /* Interface which has this address. 
*/ + u32 sw_if_index; + + /* Adjacency for neighbor probe (ARP) for this interface address. */ + u32 neighbor_probe_adj_index; + + /* Address (prefix) length for this interface. */ + u16 address_length; + + /* Will be used for something eventually. Primary vs. secondary? */ + u16 flags; + + /* Next and previous pointers for doubly linked list of + addresses per software interface. */ + u32 next_this_sw_interface; + u32 prev_this_sw_interface; +} ip_interface_address_t; + +typedef enum { + IP_LOCAL_NEXT_DROP, + IP_LOCAL_NEXT_PUNT, + // IP_LOCAL_NEXT_TCP_LOOKUP, + IP_LOCAL_NEXT_UDP_LOOKUP, + IP_LOCAL_NEXT_ICMP, + IP_LOCAL_N_NEXT, +} ip_local_next_t; + +struct ip_lookup_main_t; + +typedef void (* ip_add_del_adjacency_callback_t) (struct ip_lookup_main_t * lm, + u32 adj_index, + ip_adjacency_t * adj, + u32 is_del); + +typedef struct { + vnet_config_main_t config_main; + + u32 * config_index_by_sw_if_index; +} ip_config_main_t; + +typedef struct ip_lookup_main_t { + /* Adjacency heap. */ + ip_adjacency_t * adjacency_heap; + + /* Adjacency packet/byte counters indexed by adjacency index. */ + vlib_combined_counter_main_t adjacency_counters; + + /* Heap of (next hop, weight) blocks. Sorted by next hop. */ + ip_multipath_next_hop_t * next_hop_heap; + + /* Indexed by heap_handle from ip_adjacency_t. */ + ip_multipath_adjacency_t * multipath_adjacencies; + + /* Temporary vectors for looking up next hops in hash. */ + ip_multipath_next_hop_t * next_hop_hash_lookup_key; + ip_multipath_next_hop_t * next_hop_hash_lookup_key_normalized; + + /* Hash table mapping normalized next hops and weights + to multipath adjacency index. */ + uword * multipath_adjacency_by_next_hops; + + u32 * adjacency_remap_table; + u32 n_adjacency_remaps; + + /* If average error per adjacency is less than this threshold adjacency block + size is accepted. */ + f64 multipath_next_hop_error_tolerance; + + /* Adjacency index for routing table misses, local punts, and drops. 
*/ + u32 miss_adj_index, drop_adj_index, local_adj_index; + + /* Miss adjacency is always first in adjacency table. */ +#define IP_LOOKUP_MISS_ADJ_INDEX 0 + /* Callbacks invoked, in index order, by ip_call_add_del_adjacency_callbacks. */ + ip_add_del_adjacency_callback_t * add_del_adjacency_callbacks; + + /* Pool of addresses that are assigned to interfaces. */ + ip_interface_address_t * if_address_pool; + + /* Hash table mapping address to index in interface address pool. */ + mhash_t address_to_if_address_index; + + /* Head of doubly linked list of interface addresses for each software interface. + ~0 means this interface has no address. */ + u32 * if_address_pool_index_by_sw_if_index; + + /* First table index to use for this interface, ~0 => none */ + u32 * classify_table_index_by_sw_if_index; + + /* rx/tx interface/feature configuration. */ + ip_config_main_t rx_config_mains[VNET_N_CAST], tx_config_main; + + /* Number of bytes (and of words) in a fib result. Byte count must be at least + sizeof (uword). First word is always adjacency index. */ + u32 fib_result_n_bytes, fib_result_n_words; + + format_function_t * format_fib_result; + + /* 1 for ip6; 0 for ip4. */ + u32 is_ip6; + + /* Either format_ip4_address_and_length or format_ip6_address_and_length. */ + format_function_t * format_address_and_length; + + /* Table mapping ip protocol to ip[46]-local node next index. */ + u8 local_next_by_ip_protocol[256]; + + /* IP_BUILTIN_PROTOCOL_{TCP,UDP,ICMP,OTHER} by protocol in IP header. */ + u8 builtin_protocol_by_ip_protocol[256]; +} ip_lookup_main_t; +/* Return the adjacency stored at adj_index in lm->adjacency_heap; asserts that its heap_handle is not a free handle. */ +always_inline ip_adjacency_t * +ip_get_adjacency (ip_lookup_main_t * lm, + u32 adj_index) +{ + ip_adjacency_t * adj; + + adj = heap_elt_at_index (lm->adjacency_heap, adj_index); + + ASSERT (! 
heap_is_free_handle (lm->adjacency_heap, adj->heap_handle)); + + return adj; +} +/* Prefetch (with the given CLIB_PREFETCH type) the adjacency at adj_index; unlike ip_get_adjacency, no free-handle assertion is made. */ +#define ip_prefetch_adjacency(lm,adj_index,type) \ +do { \ + ip_adjacency_t * _adj = (lm)->adjacency_heap + (adj_index); \ + CLIB_PREFETCH (_adj, sizeof (_adj[0]), type); \ +} while (0) +/* Call every entry of lm->add_del_adjacency_callbacks, in index order, with (lm, adj_index, adjacency pointer, is_del). */ +always_inline void +ip_call_add_del_adjacency_callbacks (ip_lookup_main_t * lm, u32 adj_index, u32 is_del) +{ + ip_adjacency_t * adj; + uword i; + adj = ip_get_adjacency (lm, adj_index); + for (i = 0; i < vec_len (lm->add_del_adjacency_callbacks); i++) + lm->add_del_adjacency_callbacks[i] (lm, adj_index, adj, is_del); +} + +/* Create new block of given number of contiguous adjacencies. */ +ip_adjacency_t * +ip_add_adjacency (ip_lookup_main_t * lm, + ip_adjacency_t * adj, + u32 n_adj, + u32 * adj_index_result); + +void ip_del_adjacency (ip_lookup_main_t * lm, u32 adj_index); + +void +ip_multipath_adjacency_free (ip_lookup_main_t * lm, + ip_multipath_adjacency_t * a); + +u32 +ip_multipath_adjacency_add_del_next_hop (ip_lookup_main_t * lm, + u32 is_del, + u32 old_mp_adj_index, + u32 next_hop_adj_index, + u32 next_hop_weight, + u32 * new_mp_adj_index); + +clib_error_t * +ip_interface_address_add_del (ip_lookup_main_t * lm, + u32 sw_if_index, + void * address, + u32 address_length, + u32 is_del, + u32 * result_index); +/* Look up addr_fib in address_to_if_address_index; returns the matching if_address_pool element, or 0 when the hash has no entry. */ +always_inline ip_interface_address_t * +ip_get_interface_address (ip_lookup_main_t * lm, void * addr_fib) +{ + uword * p = mhash_get (&lm->address_to_if_address_index, addr_fib); + return p ? 
pool_elt_at_index (lm->if_address_pool, p[0]) : 0; +} +/* Return the memory of the mhash key identified by a->address_key, i.e. the stored address of interface address a. */ +always_inline void * +ip_interface_address_get_address (ip_lookup_main_t * lm, ip_interface_address_t * a) +{ return mhash_key_to_mem (&lm->address_to_if_address_index, a->address_key); } +/* Return the interface address named by the buffer's TX adjacency (asserted to be ARP or LOCAL); when that adjacency carries ~0, fall back to the head of sw_if_index's address list. */ +always_inline ip_interface_address_t * +ip_interface_address_for_packet (ip_lookup_main_t * lm, vlib_buffer_t * b, u32 sw_if_index) +{ + ip_adjacency_t * adj; + u32 if_address_index; + + adj = ip_get_adjacency (lm, vnet_buffer (b)->ip.adj_index[VLIB_TX]); + + ASSERT (adj->lookup_next_index == IP_LOOKUP_NEXT_ARP + || adj->lookup_next_index == IP_LOOKUP_NEXT_LOCAL); + if_address_index = adj->if_address_index; + if_address_index = (if_address_index == ~0 ? + vec_elt (lm->if_address_pool_index_by_sw_if_index, sw_if_index) + : if_address_index); + + return pool_elt_at_index (lm->if_address_pool, if_address_index); +} +/* Run body once per address on sw_if_index's linked list, with (a) pointing at the current element; an interface beyond the end of the index vector is treated as having no address. NOTE(review): the sw_if_index and loop arguments are expanded without parentheses - callers should pass simple expressions. */ +#define foreach_ip_interface_address(lm,a,sw_if_index,loop,body) \ +do { \ + vnet_main_t *_vnm = vnet_get_main(); \ + u32 _sw_if_index = sw_if_index; \ + vnet_sw_interface_t *_swif; \ + _swif = vnet_get_sw_interface (_vnm, _sw_if_index); \ + \ + /* \ + * Loop => honor unnumbered interface addressing. \ + */ \ + if (loop && _swif->flags & VNET_SW_INTERFACE_FLAG_UNNUMBERED) \ + _sw_if_index = _swif->unnumbered_sw_if_index; \ + u32 _ia = \ + (vec_len((lm)->if_address_pool_index_by_sw_if_index) \ + > (_sw_if_index)) \ + ? 
vec_elt ((lm)->if_address_pool_index_by_sw_if_index, \ + (_sw_if_index)) : (u32)~0; \ + ip_interface_address_t * _a; \ + while (_ia != ~0) /* ~0 terminates the per-interface list */ \ + { \ + _a = pool_elt_at_index ((lm)->if_address_pool, _ia); \ + _ia = _a->next_this_sw_interface; /* next index is read before body runs */ \ + (a) = _a; \ + body; \ + } \ +} while (0) + +void ip_lookup_init (ip_lookup_main_t * lm, u32 ip_lookup_node_index); + +serialize_function_t serialize_ip_lookup_main, unserialize_ip_lookup_main; +serialize_function_t serialize_vec_ip_adjacency, unserialize_vec_ip_adjacency; + +#endif /* included_ip_lookup_h */ diff --git a/vnet/vnet/ip/ports.def b/vnet/vnet/ip/ports.def new file mode 100644 index 00000000000..cdb754f5b2e --- /dev/null +++ b/vnet/vnet/ip/ports.def @@ -0,0 +1,757 @@ +/* + * ip/ports.def: tcp/udp port definitions + * + * Eliot Dresselhaus + * August, 2005 + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* +PORT NUMBERS + +(last updated 18 October 2005) + +The port numbers are divided into three ranges: the Well Known Ports, +the Registered Ports, and the Dynamic and/or Private Ports. + +The Well Known Ports are those from 0 through 1023. + +The Registered Ports are those from 1024 through 49151 + +The Dynamic and/or Private Ports are those from 49152 through 65535 + + +************************************************************************ +* PLEASE NOTE THE FOLLOWING: * +* * +* 1. UNASSIGNED PORT NUMBERS SHOULD NOT BE USED. THE IANA WILL ASSIGN * +* THE NUMBER FOR THE PORT AFTER YOUR APPLICATION HAS BEEN APPROVED. * +* * +* 2. ASSIGNMENT OF A PORT NUMBER DOES NOT IN ANY WAY IMPLY AN * +* ENDORSEMENT OF AN APPLICATION OR PRODUCT, AND THE FACT THAT NETWORK * +* TRAFFIC IS FLOWING TO OR FROM A REGISTERED PORT DOES NOT MEAN THAT * +* IT IS "GOOD" TRAFFIC. FIREWALL AND SYSTEM ADMINISTRATORS SHOULD * +* CHOOSE HOW TO CONFIGURE THEIR SYSTEMS BASED ON THEIR KNOWLEDGE OF * +* THE TRAFFIC IN QUESTION, NOT WHETHER THERE IS A PORT NUMBER * +* REGISTERED OR NOT. * +************************************************************************ + + +WELL KNOWN PORT NUMBERS + +The Well Known Ports are assigned by the IANA and on most systems can +only be used by system (or root) processes or by programs executed by +privileged users. + +Ports are used in the TCP [RFC793] to name the ends of logical +connections which carry long term conversations. For the purpose of +providing services to unknown callers, a service contact port is +defined. This list specifies the port used by the server process as +its contact port. The contact port is sometimes called the +"well-known port". + +To the extent possible, these same port assignments are used with the +UDP [RFC768]. + +The range for assigned ports managed by the IANA is 0-1023. 
+*/ +ip_port (TCPMUX, 1) +ip_port (COMPRESS_NET_MANAGEMENT, 2) +ip_port (COMPRESS_NET, 3) +ip_port (RJE, 5) +ip_port (ECHO, 7) +ip_port (DISCARD, 9) +ip_port (SYSTAT, 11) +ip_port (DAYTIME, 13) +ip_port (QOTD, 17) +ip_port (MSP, 18) +ip_port (CHARGEN, 19) +ip_port (FTP_DATA, 20) +ip_port (FTP, 21) +ip_port (SSH, 22) +ip_port (TELNET, 23) +ip_port (SMTP, 25) +ip_port (NSW_FE, 27) +ip_port (MSG_ICP, 29) +ip_port (MSG_AUTH, 31) +ip_port (DSP, 33) +ip_port (TIME, 37) +ip_port (RAP, 38) +ip_port (RLP, 39) +ip_port (GRAPHICS, 41) +ip_port (NAME, 42) +ip_port (NAMESERVER, 42) +ip_port (NICNAME, 43) +ip_port (MPM_FLAGS, 44) +ip_port (MPM, 45) +ip_port (MPM_SND, 46) +ip_port (NI_FTP, 47) +ip_port (AUDITD, 48) +ip_port (TACACS, 49) +ip_port (RE_MAIL_CK, 50) +ip_port (LA_MAINT, 51) +ip_port (XNS_TIME, 52) +ip_port (DNS, 53) +ip_port (XNS_CH, 54) +ip_port (ISI_GL, 55) +ip_port (XNS_AUTH, 56) +ip_port (XNS_MAIL, 58) +ip_port (NI_MAIL, 61) +ip_port (ACAS, 62) +ip_port (WHOIS_PLUS_PLUS, 63) +ip_port (COVIA, 64) +ip_port (TACACS_DS, 65) +ip_port (ORACLE_SQL_NET, 66) +ip_port (BOOTPS, 67) +ip_port (BOOTPC, 68) +ip_port (TFTP, 69) +ip_port (GOPHER, 70) +ip_port (NETRJS_1, 71) +ip_port (NETRJS_2, 72) +ip_port (NETRJS_3, 73) +ip_port (NETRJS_4, 74) +ip_port (DEOS, 76) +ip_port (VETTCP, 78) +ip_port (FINGER, 79) +ip_port (WWW, 80) +ip_port (HOSTS2_NS, 81) +ip_port (XFER, 82) +ip_port (MIT_ML_DEV, 83) +ip_port (CTF, 84) +ip_port (MIT_ML_DEV1, 85) +ip_port (MFCOBOL, 86) +ip_port (KERBEROS, 88) +ip_port (SU_MIT_TG, 89) +ip_port (DNSIX, 90) +ip_port (MIT_DOV, 91) +ip_port (NPP, 92) +ip_port (DCP, 93) +ip_port (OBJCALL, 94) +ip_port (SUPDUP, 95) +ip_port (DIXIE, 96) +ip_port (SWIFT_RVF, 97) +ip_port (TACNEWS, 98) +ip_port (METAGRAM, 99) +ip_port (NEWACCT, 100) +ip_port (HOSTNAME, 101) +ip_port (ISO_TSAP, 102) +ip_port (GPPITNP, 103) +ip_port (ACR_NEMA, 104) +ip_port (CSO, 105) +ip_port (CSNET_NS, 105) +ip_port (3COM_TSMUX, 106) +ip_port (RTELNET, 107) +ip_port (SNAGAS, 108) +ip_port (POP2, 
109) +ip_port (POP3, 110) +ip_port (SUNRPC, 111) +ip_port (MCIDAS, 112) +ip_port (IDENT, 113) +ip_port (SFTP, 115) +ip_port (ANSANOTIFY, 116) +ip_port (UUCP_PATH, 117) +ip_port (SQLSERV, 118) +ip_port (NNTP, 119) +ip_port (CFDPTKT, 120) +ip_port (ERPC, 121) +ip_port (SMAKYNET, 122) +ip_port (NTP, 123) +ip_port (ANSATRADER, 124) +ip_port (LOCUS_MAP, 125) +ip_port (NXEDIT, 126) +ip_port (LOCUS_CON, 127) +ip_port (GSS_XLICEN, 128) +ip_port (PWDGEN, 129) +ip_port (CISCO_FNA, 130) +ip_port (CISCO_TNA, 131) +ip_port (CISCO_SYS, 132) +ip_port (STATSRV, 133) +ip_port (INGRES_NET, 134) +ip_port (EPMAP, 135) +ip_port (PROFILE, 136) +ip_port (NETBIOS_NS, 137) +ip_port (NETBIOS_DGM, 138) +ip_port (NETBIOS_SSN, 139) +ip_port (EMFIS_DATA, 140) +ip_port (EMFIS_CNTL, 141) +ip_port (BL_IDM, 142) +ip_port (IMAP, 143) +ip_port (UMA, 144) +ip_port (UAAC, 145) +ip_port (ISO_TP0, 146) +ip_port (ISO_IP, 147) +ip_port (JARGON, 148) +ip_port (AED_512, 149) +ip_port (SQL_NET, 150) +ip_port (HEMS, 151) +ip_port (BFTP, 152) +ip_port (SGMP, 153) +ip_port (NETSC_PROD, 154) +ip_port (NETSC_DEV, 155) +ip_port (SQLSRV, 156) +ip_port (KNET_CMP, 157) +ip_port (PCMAIL_SRV, 158) +ip_port (NSS_ROUTING, 159) +ip_port (SGMP_TRAPS, 160) +ip_port (SNMP, 161) +ip_port (SNMPTRAP, 162) +ip_port (CMIP_MAN, 163) +ip_port (CMIP_AGENT, 164) +ip_port (XNS_COURIER, 165) +ip_port (S_NET, 166) +ip_port (NAMP, 167) +ip_port (RSVD, 168) +ip_port (SEND, 169) +ip_port (PRINT_SRV, 170) +ip_port (MULTIPLEX, 171) +ip_port (CL1, 172) +ip_port (XYPLEX_MUX, 173) +ip_port (MAILQ, 174) +ip_port (VMNET, 175) +ip_port (GENRAD_MUX, 176) +ip_port (XDMCP, 177) +ip_port (NEXTSTEP, 178) +ip_port (BGP, 179) +ip_port (RIS, 180) +ip_port (UNIFY, 181) +ip_port (AUDIT, 182) +ip_port (OCBINDER, 183) +ip_port (OCSERVER, 184) +ip_port (REMOTE_KIS, 185) +ip_port (KIS, 186) +ip_port (ACI, 187) +ip_port (MUMPS, 188) +ip_port (QFT, 189) +ip_port (GACP, 190) +ip_port (PROSPERO, 191) +ip_port (OSU_NMS, 192) +ip_port (SRMP, 193) +ip_port (IRC, 194) 
+ip_port (DN6_NLM_AUD, 195) +ip_port (DN6_SMM_RED, 196) +ip_port (DLS, 197) +ip_port (DLS_MON, 198) +ip_port (SMUX, 199) +ip_port (SRC, 200) +ip_port (AT_RTMP, 201) +ip_port (AT_NBP, 202) +ip_port (AT_3, 203) +ip_port (AT_ECHO, 204) +ip_port (AT_5, 205) +ip_port (AT_ZIS, 206) +ip_port (AT_7, 207) +ip_port (AT_8, 208) +ip_port (QMTP, 209) +ip_port (Z39_50, 210) +ip_port (TI914CG, 211) +ip_port (ANET, 212) +ip_port (IPX, 213) +ip_port (VMPWSCS, 214) +ip_port (SOFTPC, 215) +ip_port (CAILIC, 216) +ip_port (DBASE, 217) +ip_port (MPP, 218) +ip_port (UARPS, 219) +ip_port (IMAP3, 220) +ip_port (FLN_SPX, 221) +ip_port (RSH_SPX, 222) +ip_port (CDC, 223) +ip_port (MASQDIALER, 224) +ip_port (DIRECT, 242) +ip_port (SUR_MEAS, 243) +ip_port (INBUSINESS, 244) +ip_port (LINK, 245) +ip_port (DSP3270, 246) +ip_port (SUBNTBCST_TFTP, 247) +ip_port (BHFHS, 248) +ip_port (RAP1, 256) +ip_port (SET, 257) +ip_port (YAK_CHAT, 258) +ip_port (ESRO_GEN, 259) +ip_port (OPENPORT, 260) +ip_port (NSIIOPS, 261) +ip_port (ARCISDMS, 262) +ip_port (HDAP, 263) +ip_port (BGMP, 264) +ip_port (X_BONE_CTL, 265) +ip_port (SST, 266) +ip_port (TD_SERVICE, 267) +ip_port (TD_REPLICA, 268) +ip_port (HTTP_MGMT, 280) +ip_port (PERSONAL_LINK, 281) +ip_port (CABLEPORT_AX, 282) +ip_port (RESCAP, 283) +ip_port (CORERJD, 284) +ip_port (FXP, 286) +ip_port (K_BLOCK, 287) +ip_port (NOVASTORBAKCUP, 308) +ip_port (ENTRUSTTIME, 309) +ip_port (BHMDS, 310) +ip_port (ASIP_WEBADMIN, 311) +ip_port (VSLMP, 312) +ip_port (MAGENTA_LOGIC, 313) +ip_port (OPALIS_ROBOT, 314) +ip_port (DPSI, 315) +ip_port (DECAUTH, 316) +ip_port (ZANNET, 317) +ip_port (PKIX_TIMESTAMP, 318) +ip_port (PTP_EVENT, 319) +ip_port (PTP_GENERAL, 320) +ip_port (PIP, 321) +ip_port (RTSPS, 322) +ip_port (TEXAR, 333) +ip_port (PDAP, 344) +ip_port (PAWSERV, 345) +ip_port (ZSERV, 346) +ip_port (FATSERV, 347) +ip_port (CSI_SGWP, 348) +ip_port (MFTP, 349) +ip_port (MATIP_TYPE_A, 350) +ip_port (MATIP_TYPE_B, 351) +ip_port (BHOETTY, 351) +ip_port (DTAG_STE_SB, 352) 
+ip_port (BHOEDAP4, 352) +ip_port (NDSAUTH, 353) +ip_port (BH611, 354) +ip_port (DATEX_ASN, 355) +ip_port (CLOANTO_NET_1, 356) +ip_port (BHEVENT, 357) +ip_port (SHRINKWRAP, 358) +ip_port (NSRMP, 359) +ip_port (SCOI2ODIALOG, 360) +ip_port (SEMANTIX, 361) +ip_port (SRSSEND, 362) +ip_port (RSVP_TUNNEL, 363) +ip_port (AURORA_CMGR, 364) +ip_port (DTK, 365) +ip_port (ODMR, 366) +ip_port (MORTGAGEWARE, 367) +ip_port (QBIKGDP, 368) +ip_port (RPC2PORTMAP, 369) +ip_port (CODAAUTH2, 370) +ip_port (CLEARCASE, 371) +ip_port (ULISTPROC, 372) +ip_port (LEGENT_1, 373) +ip_port (LEGENT_2, 374) +ip_port (HASSLE, 375) +ip_port (NIP, 376) +ip_port (TNETOS, 377) +ip_port (DSETOS, 378) +ip_port (IS99C, 379) +ip_port (IS99S, 380) +ip_port (HP_COLLECTOR, 381) +ip_port (HP_MANAGED_NODE, 382) +ip_port (HP_ALARM_MGR, 383) +ip_port (ARNS, 384) +ip_port (IBM_APP, 385) +ip_port (ASA, 386) +ip_port (AURP, 387) +ip_port (UNIDATA_LDM, 388) +ip_port (LDAP, 389) +ip_port (UIS, 390) +ip_port (SYNOTICS_RELAY, 391) +ip_port (SYNOTICS_BROKER, 392) +ip_port (META5, 393) +ip_port (EMBL_NDT, 394) +ip_port (NETCP, 395) +ip_port (NETWARE_IP, 396) +ip_port (MPTN, 397) +ip_port (KRYPTOLAN, 398) +ip_port (ISO_TSAP_C2, 399) +ip_port (WORK_SOL, 400) +ip_port (UPS, 401) +ip_port (GENIE, 402) +ip_port (DECAP, 403) +ip_port (NCED, 404) +ip_port (NCLD, 405) +ip_port (IMSP, 406) +ip_port (TIMBUKTU, 407) +ip_port (PRM_SM, 408) +ip_port (PRM_NM, 409) +ip_port (DECLADEBUG, 410) +ip_port (RMT, 411) +ip_port (SYNOPTICS_TRAP, 412) +ip_port (SMSP, 413) +ip_port (INFOSEEK, 414) +ip_port (BNET, 415) +ip_port (SILVERPLATTER, 416) +ip_port (ONMUX, 417) +ip_port (HYPER_G, 418) +ip_port (ARIEL1, 419) +ip_port (SMPTE, 420) +ip_port (ARIEL2, 421) +ip_port (ARIEL3, 422) +ip_port (OPC_JOB_START, 423) +ip_port (OPC_JOB_TRACK, 424) +ip_port (ICAD_EL, 425) +ip_port (SMARTSDP, 426) +ip_port (SVRLOC, 427) +ip_port (OCS_CMU, 428) +ip_port (OCS_AMU, 429) +ip_port (UTMPSD, 430) +ip_port (UTMPCD, 431) +ip_port (IASD, 432) +ip_port (NNSP, 433) 
+ip_port (MOBILEIP_AGENT, 434) +ip_port (MOBILIP_MN, 435) +ip_port (DNA_CML, 436) +ip_port (COMSCM, 437) +ip_port (DSFGW, 438) +ip_port (DASP, 439) +ip_port (SGCP, 440) +ip_port (DECVMS_SYSMGT, 441) +ip_port (CVC_HOSTD, 442) +ip_port (HTTPS, 443) +ip_port (SNPP, 444) +ip_port (MICROSOFT_DS, 445) +ip_port (DDM_RDB, 446) +ip_port (DDM_DFM, 447) +ip_port (DDM_SSL, 448) +ip_port (AS_SERVERMAP, 449) +ip_port (TSERVER, 450) +ip_port (SFS_SMP_NET, 451) +ip_port (SFS_CONFIG, 452) +ip_port (CREATIVESERVER, 453) +ip_port (CONTENTSERVER, 454) +ip_port (CREATIVEPARTNR, 455) +ip_port (MACON_TCP, 456) +ip_port (SCOHELP, 457) +ip_port (APPLEQTC, 458) +ip_port (AMPR_RCMD, 459) +ip_port (SKRONK, 460) +ip_port (DATASURFSRV, 461) +ip_port (DATASURFSRVSEC, 462) +ip_port (ALPES, 463) +ip_port (KPASSWD, 464) +ip_port (URD, 465) +ip_port (DIGITAL_VRC, 466) +ip_port (MYLEX_MAPD, 467) +ip_port (PHOTURIS, 468) +ip_port (RCP, 469) +ip_port (SCX_PROXY, 470) +ip_port (MONDEX, 471) +ip_port (LJK_LOGIN, 472) +ip_port (HYBRID_POP, 473) +ip_port (TN_TL_W1, 474) +ip_port (TCPNETHASPSRV, 475) +ip_port (TN_TL_FD1, 476) +ip_port (SS7NS, 477) +ip_port (SPSC, 478) +ip_port (IAFSERVER, 479) +ip_port (IAFDBASE, 480) +ip_port (PH, 481) +ip_port (BGS_NSI, 482) +ip_port (ULPNET, 483) +ip_port (INTEGRA_SME, 484) +ip_port (POWERBURST, 485) +ip_port (AVIAN, 486) +ip_port (SAFT, 487) +ip_port (GSS_HTTP, 488) +ip_port (NEST_PROTOCOL, 489) +ip_port (MICOM_PFS, 490) +ip_port (GO_LOGIN, 491) +ip_port (TICF_1, 492) +ip_port (TICF_2, 493) +ip_port (POV_RAY, 494) +ip_port (INTECOURIER, 495) +ip_port (PIM_RP_DISC, 496) +ip_port (DANTZ, 497) +ip_port (SIAM, 498) +ip_port (ISO_ILL, 499) +ip_port (ISAKMP, 500) +ip_port (STMF, 501) +ip_port (ASA_APPL_PROTO, 502) +ip_port (INTRINSA, 503) +ip_port (CITADEL, 504) +ip_port (MAILBOX_LM, 505) +ip_port (OHIMSRV, 506) +ip_port (CRS, 507) +ip_port (XVTTP, 508) +ip_port (SNARE, 509) +ip_port (FCP, 510) +ip_port (PASSGO, 511) +ip_port (EXEC, 512) +ip_port (LOGIN, 513) +ip_port (SHELL, 
514) +ip_port (PRINTER, 515) +ip_port (VIDEOTEX, 516) +ip_port (TALK, 517) +ip_port (NTALK, 518) +ip_port (UTIME, 519) +ip_port (EFS, 520) +ip_port (RIPNG, 521) +ip_port (ULP, 522) +ip_port (IBM_DB2, 523) +ip_port (NCP, 524) +ip_port (TIMED, 525) +ip_port (TEMPO, 526) +ip_port (STX, 527) +ip_port (CUSTIX, 528) +ip_port (IRC_SERV, 529) +ip_port (COURIER, 530) +ip_port (CONFERENCE, 531) +ip_port (NETNEWS, 532) +ip_port (NETWALL, 533) +ip_port (MM_ADMIN, 534) +ip_port (IIOP, 535) +ip_port (OPALIS_RDV, 536) +ip_port (NMSP, 537) +ip_port (GDOMAP, 538) +ip_port (APERTUS_LDP, 539) +ip_port (UUCP, 540) +ip_port (UUCP_RLOGIN, 541) +ip_port (COMMERCE, 542) +ip_port (KLOGIN, 543) +ip_port (KSHELL, 544) +ip_port (APPLEQTCSRVR, 545) +ip_port (DHCPV6_CLIENT, 546) +ip_port (DHCPV6_SERVER, 547) +ip_port (AFPOVERTCP, 548) +ip_port (IDFP, 549) +ip_port (NEW_RWHO, 550) +ip_port (CYBERCASH, 551) +ip_port (DEVSHR_NTS, 552) +ip_port (PIRP, 553) +ip_port (RTSP, 554) +ip_port (DSF, 555) +ip_port (REMOTEFS, 556) +ip_port (OPENVMS_SYSIPC, 557) +ip_port (SDNSKMP, 558) +ip_port (TEEDTAP, 559) +ip_port (RMONITOR, 560) +ip_port (MONITOR, 561) +ip_port (CHSHELL, 562) +ip_port (NNTPS, 563) +ip_port (9PFS, 564) +ip_port (WHOAMI, 565) +ip_port (STREETTALK, 566) +ip_port (BANYAN_RPC, 567) +ip_port (MS_SHUTTLE, 568) +ip_port (MS_ROME, 569) +ip_port (METER, 570) +ip_port (METER1, 571) +ip_port (SONAR, 572) +ip_port (BANYAN_VIP, 573) +ip_port (FTP_AGENT, 574) +ip_port (VEMMI, 575) +ip_port (IPCD, 576) +ip_port (VNAS, 577) +ip_port (IPDD, 578) +ip_port (DECBSRV, 579) +ip_port (SNTP_HEARTBEAT, 580) +ip_port (BDP, 581) +ip_port (SCC_SECURITY, 582) +ip_port (PHILIPS_VC, 583) +ip_port (KEYSERVER, 584) +ip_port (IMAP4_SSL, 585) +ip_port (PASSWORD_CHG, 586) +ip_port (SUBMISSION, 587) +ip_port (CAL, 588) +ip_port (EYELINK, 589) +ip_port (TNS_CML, 590) +ip_port (HTTP_ALT, 591) +ip_port (EUDORA_SET, 592) +ip_port (HTTP_RPC_EPMAP, 593) +ip_port (TPIP, 594) +ip_port (CAB_PROTOCOL, 595) +ip_port (SMSD, 596) 
+ip_port (PTCNAMESERVICE, 597) +ip_port (SCO_WEBSRVRMG3, 598) +ip_port (ACP, 599) +ip_port (IPCSERVER, 600) +ip_port (SYSLOG_CONN, 601) +ip_port (XMLRPC_BEEP, 602) +ip_port (IDXP, 603) +ip_port (TUNNEL, 604) +ip_port (SOAP_BEEP, 605) +ip_port (URM, 606) +ip_port (NQS, 607) +ip_port (SIFT_UFT, 608) +ip_port (NPMP_TRAP, 609) +ip_port (NPMP_LOCAL, 610) +ip_port (NPMP_GUI, 611) +ip_port (HMMP_IND, 612) +ip_port (HMMP_OP, 613) +ip_port (SSHELL, 614) +ip_port (SCO_INETMGR, 615) +ip_port (SCO_SYSMGR, 616) +ip_port (SCO_DTMGR, 617) +ip_port (DEI_ICDA, 618) +ip_port (COMPAQ_EVM, 619) +ip_port (SCO_WEBSRVRMGR, 620) +ip_port (ESCP_IP, 621) +ip_port (COLLABORATOR, 622) +ip_port (ASF_RMCP, 623) +ip_port (CRYPTOADMIN, 624) +ip_port (DEC_DLM, 625) +ip_port (ASIA, 626) +ip_port (PASSGO_TIVOLI, 627) +ip_port (QMQP, 628) +ip_port (3COM_AMP3, 629) +ip_port (RDA, 630) +ip_port (IPP, 631) +ip_port (BMPP, 632) +ip_port (SERVSTAT, 633) +ip_port (GINAD, 634) +ip_port (RLZDBASE, 635) +ip_port (LDAPS, 636) +ip_port (LANSERVER, 637) +ip_port (MCNS_SEC, 638) +ip_port (MSDP, 639) +ip_port (ENTRUST_SPS, 640) +ip_port (REPCMD, 641) +ip_port (ESRO_EMSDP, 642) +ip_port (SANITY, 643) +ip_port (DWR, 644) +ip_port (PSSC, 645) +ip_port (LDP, 646) +ip_port (DHCP_FAILOVER, 647) +ip_port (RRP, 648) +ip_port (CADVIEW_3D, 649) +ip_port (OBEX, 650) +ip_port (IEEE_MMS, 651) +ip_port (HELLO_PORT, 652) +ip_port (REPSCMD, 653) +ip_port (AODV, 654) +ip_port (TINC, 655) +ip_port (SPMP, 656) +ip_port (RMC, 657) +ip_port (TENFOLD, 658) +ip_port (MAC_SRVR_ADMIN, 660) +ip_port (HAP, 661) +ip_port (PFTP, 662) +ip_port (PURENOISE, 663) +ip_port (ASF_SECURE_RMCP, 664) +ip_port (SUN_DR, 665) +ip_port (MDQS, 666) +ip_port (DOOM, 666) +ip_port (DISCLOSE, 667) +ip_port (MECOMM, 668) +ip_port (MEREGISTER, 669) +ip_port (VACDSM_SWS, 670) +ip_port (VACDSM_APP, 671) +ip_port (VPPS_QUA, 672) +ip_port (CIMPLEX, 673) +ip_port (ACAP, 674) +ip_port (DCTP, 675) +ip_port (VPPS_VIA, 676) +ip_port (VPP, 677) +ip_port (GGF_NCP, 678) 
+ip_port (MRM, 679) +ip_port (ENTRUST_AAAS, 680) +ip_port (ENTRUST_AAMS, 681) +ip_port (XFR, 682) +ip_port (CORBA_IIOP, 683) +ip_port (CORBA_IIOP_SSL, 684) +ip_port (MDC_PORTMAPPER, 685) +ip_port (HCP_WISMAR, 686) +ip_port (ASIPREGISTRY, 687) +ip_port (REALM_RUSD, 688) +ip_port (NMAP, 689) +ip_port (VATP, 690) +ip_port (MSEXCH_ROUTING, 691) +ip_port (HYPERWAVE_ISP, 692) +ip_port (CONNENDP, 693) +ip_port (HA_CLUSTER, 694) +ip_port (IEEE_MMS_SSL, 695) +ip_port (RUSHD, 696) +ip_port (UUIDGEN, 697) +ip_port (OLSR, 698) +ip_port (ACCESSNETWORK, 699) +ip_port (EPP, 700) +ip_port (LMP, 701) +ip_port (IRIS_BEEP, 702) +ip_port (ELCSD, 704) +ip_port (AGENTX, 705) +ip_port (SILC, 706) +ip_port (BORLAND_DSJ, 707) +ip_port (ENTRUST_KMSH, 709) +ip_port (ENTRUST_ASH, 710) +ip_port (CISCO_TDP, 711) +ip_port (TBRPF, 712) +ip_port (NETVIEWDM1, 729) +ip_port (NETVIEWDM2, 730) +ip_port (NETVIEWDM3, 731) +ip_port (NETGW, 741) +ip_port (NETRCS, 742) +ip_port (FLEXLM, 744) +ip_port (FUJITSU_DEV, 747) +ip_port (RIS_CM, 748) +ip_port (KERBEROS_ADM, 749) +ip_port (RFILE, 750) +ip_port (PUMP, 751) +ip_port (QRH, 752) +ip_port (RRH, 753) +ip_port (TELL, 754) +ip_port (NLOGIN, 758) +ip_port (CON, 759) +ip_port (NS, 760) +ip_port (RXE, 761) +ip_port (QUOTAD, 762) +ip_port (CYCLESERV, 763) +ip_port (OMSERV, 764) +ip_port (WEBSTER, 765) +ip_port (PHONEBOOK, 767) +ip_port (VID, 769) +ip_port (CADLOCK, 770) +ip_port (RTIP, 771) +ip_port (CYCLESERV2, 772) +ip_port (SUBMIT, 773) +ip_port (RPASSWD, 774) +ip_port (ENTOMB, 775) +ip_port (WPAGES, 776) +ip_port (MULTILING_HTTP, 777) +ip_port (WPGS, 780) +ip_port (MDBS_DAEMON, 800) +ip_port (DEVICE, 801) +ip_port (FCP_UDP, 810) +ip_port (ITM_MCELL_S, 828) +ip_port (PKIX_3_CA_RA, 829) +ip_port (DHCP_FAILOVER2, 847) +ip_port (GDOI, 848) +ip_port (ISCSI, 860) +ip_port (RSYNC, 873) +ip_port (ICLCNET_LOCATE, 886) +ip_port (ICLCNET_SVINFO, 887) +ip_port (ACCESSBUILDER, 888) +ip_port (CDDBP, 888) +ip_port (OMGINITIALREFS, 900) +ip_port (SMPNAMERES, 901) +ip_port 
(IDEAFARM_CHAT, 902) +ip_port (IDEAFARM_CATCH, 903) +ip_port (XACT_BACKUP, 911) +ip_port (APEX_MESH, 912) +ip_port (APEX_EDGE, 913) +ip_port (FTPS_DATA, 989) +ip_port (FTPS, 990) +ip_port (NAS, 991) +ip_port (TELNETS, 992) +ip_port (IMAPS, 993) +ip_port (IRCS, 994) +ip_port (POP3S, 995) +ip_port (VSINET, 996) +ip_port (MAITRD, 997) +ip_port (BUSBOY, 998) +ip_port (GARCON, 999) +ip_port (PUPROUTER, 999) +ip_port (CADLOCK2, 1000) +ip_port (SURF, 1010) + diff --git a/vnet/vnet/ip/protocols.def b/vnet/vnet/ip/protocols.def new file mode 100644 index 00000000000..77fab31da05 --- /dev/null +++ b/vnet/vnet/ip/protocols.def @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Emacs editing mode -*-C-*- + +From http://www.iana.org/assignments/protocol-numbers + +PROTOCOL NUMBERS + +(last updated 18 October 2004) + +In the Internet Protocol version 4 (IPv4) [RFC791] there is a field, +called "Protocol", to identify the next level protocol. This is an 8 +bit field. In Internet Protocol version 6 (IPv6) [RFC1883] this field +is called the "Next Header" field. 
+*/ +ip_protocol (0, IP6_HOP_BY_HOP_OPTIONS) +ip_protocol (1, ICMP) +ip_protocol (2, IGMP) +ip_protocol (3, GGP) +ip_protocol (4, IP_IN_IP) +ip_protocol (5, ST) +ip_protocol (6, TCP) +ip_protocol (7, CBT) +ip_protocol (8, EGP) +ip_protocol (9, IGP) +ip_protocol (10, BBN_RCC_MON) +ip_protocol (11, NVP_II) +ip_protocol (12, PUP) +ip_protocol (13, ARGUS) +ip_protocol (14, EMCON) +ip_protocol (15, XNET) +ip_protocol (16, CHAOS) +ip_protocol (17, UDP) +ip_protocol (18, MUX) +ip_protocol (19, DCN_MEAS) +ip_protocol (20, HMP) +ip_protocol (21, PRM) +ip_protocol (22, XNS_IDP) +ip_protocol (23, TRUNK_1) +ip_protocol (24, TRUNK_2) +ip_protocol (25, LEAF_1) +ip_protocol (26, LEAF_2) +ip_protocol (27, RDP) +ip_protocol (28, IRTP) +ip_protocol (29, ISO_TP4) +ip_protocol (30, NETBLT) +ip_protocol (31, MFE_NSP) +ip_protocol (32, MERIT_INP) +ip_protocol (33, SEP) +ip_protocol (34, 3PC) +ip_protocol (35, IDPR) +ip_protocol (36, XTP) +ip_protocol (37, DDP) +ip_protocol (38, IDPR_CMTP) +ip_protocol (39, TP) +ip_protocol (40, IL) +ip_protocol (41, IPV6) +ip_protocol (42, SDRP) +ip_protocol (43, IPV6_ROUTE) +ip_protocol (44, IPV6_FRAGMENTATION) +ip_protocol (45, IDRP) +ip_protocol (46, RSVP) +ip_protocol (47, GRE) +ip_protocol (48, MHRP) +ip_protocol (49, BNA) +ip_protocol (50, IPSEC_ESP) +ip_protocol (51, IPSEC_AH) +ip_protocol (52, I_NLSP) +ip_protocol (53, SWIPE) +ip_protocol (54, NARP) +ip_protocol (55, MOBILE) +ip_protocol (56, TLSP) +ip_protocol (57, SKIP) +ip_protocol (58, ICMP6) +ip_protocol (59, IP6_NONXT) +ip_protocol (60, IP6_DESTINATION_OPTIONS) +ip_protocol (62, CFTP) +ip_protocol (64, SAT_EXPAK) +ip_protocol (65, KRYPTOLAN) +ip_protocol (66, RVD) +ip_protocol (67, IPPC) +ip_protocol (69, SAT_MON) +ip_protocol (70, VISA) +ip_protocol (71, IPCV) +ip_protocol (72, CPNX) +ip_protocol (73, CPHB) +ip_protocol (74, WSN) +ip_protocol (75, PVP) +ip_protocol (76, BR_SAT_MON) +ip_protocol (77, SUN_ND) +ip_protocol (78, WB_MON) +ip_protocol (79, WB_EXPAK) +ip_protocol (80, ISO_IP) 
+ip_protocol (81, VMTP) +ip_protocol (82, SECURE_VMTP) +ip_protocol (83, VINES) +ip_protocol (84, TTP) +ip_protocol (85, NSFNET_IGP) +ip_protocol (86, DGP) +ip_protocol (87, TCF) +ip_protocol (88, EIGRP) +ip_protocol (89, OSPF) +ip_protocol (90, SPRITE_RPC) +ip_protocol (91, LARP) +ip_protocol (92, MTP) +ip_protocol (93, AX) +ip_protocol (94, IPIP) +ip_protocol (95, MICP) +ip_protocol (96, SCC_SP) +ip_protocol (97, ETHERIP) +ip_protocol (98, ENCAP) +ip_protocol (100, GMTP) +ip_protocol (101, IFMP) +ip_protocol (102, PNNI) +ip_protocol (103, PIM) +ip_protocol (104, ARIS) +ip_protocol (105, SCPS) +ip_protocol (106, QNX) +ip_protocol (107, A) +ip_protocol (108, IPCOMP) +ip_protocol (109, SNP) +ip_protocol (110, COMPAQ_PEER) +ip_protocol (111, IPX_IN_IP) +ip_protocol (112, VRRP) +ip_protocol (113, PGM) +ip_protocol (115, L2TP) +ip_protocol (116, DDX) +ip_protocol (117, IATP) +ip_protocol (118, STP) +ip_protocol (119, SRP) +ip_protocol (120, UTI) +ip_protocol (121, SMP) +ip_protocol (122, SM) +ip_protocol (123, PTP) +ip_protocol (124, ISIS) +ip_protocol (125, FIRE) +ip_protocol (126, CRTP) +ip_protocol (127, CRUDP) +ip_protocol (128, SSCOPMCE) +ip_protocol (129, IPLT) +ip_protocol (130, SPS) +ip_protocol (131, PIPE) +ip_protocol (132, SCTP) +ip_protocol (133, FC) +ip_protocol (134, RSVP_E2E_IGNORE) +ip_protocol (135, MOBILITY) +ip_protocol (136, UDP_LITE) +ip_protocol (137, MPLS_IN_IP) +ip_protocol (255, RESERVED) + diff --git a/vnet/vnet/ip/tcp.c b/vnet/vnet/ip/tcp.c new file mode 100644 index 00000000000..53f82f1c5b9 --- /dev/null +++ b/vnet/vnet/ip/tcp.c @@ -0,0 +1,2983 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/tcp.c: tcp protocol + * + * Copyright (c) 2011 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/ip/ip.h> +#include <vnet/ip/tcp.h> +#include <math.h> +/* Maps one byte of a match mask (one nibble per slot) to two bits: bit 0 for a full low nibble, bit 1 for a full high nibble. Byte values not listed map to 0. */ +static u8 my_zero_mask_table[256] = { + [0xf0] = (1 << 1), + [0x0f] = (1 << 0), + [0xff] = (1 << 0) | (1 << 1), +}; +/* Compress the low 16 bits of x (one nibble per slot) into a 4-bit bitmap: bit i is set when nibble i is 0xf. */ +static_always_inline u32 my_zero_mask (u32 x) +{ + return ((my_zero_mask_table[(x >> 0) & 0xff] << 0) + | (my_zero_mask_table[(x >> 8) & 0xff] << 2)); +} +/* Index (0 or 1) of the lowest full nibble in a byte; 4 marks a zero byte. Byte values not listed map to 0. */ +static u8 my_first_set_table[256] = { + [0x00] = 4, + [0xf0] = 1, + [0x0f] = 0, + [0xff] = 0, +}; +/* Index (0..3) of the lowest full nibble in the low 16 bits of zero_mask; yields 6 (2 + 4) when both bytes are zero. */ +static_always_inline u32 my_first_set (u32 zero_mask) +{ + u8 r0 = my_first_set_table[(zero_mask >> 0) & 0xff]; + u8 r1 = 2 + my_first_set_table[(zero_mask >> 8) & 0xff]; + return r0 != 4 ? r0 : r1; +} +/* Store the source/destination addresses of ip and the packed port pair of tcp into slot i of a. */ +static_always_inline void +ip4_tcp_udp_address_x4_set_from_headers (ip4_tcp_udp_address_x4_t * a, + ip4_header_t * ip, + tcp_header_t * tcp, + u32 i) +{ + a->src.as_ip4_address[i] = ip->src_address; + a->dst.as_ip4_address[i] = ip->dst_address; + a->ports.as_ports[i].as_u32 = tcp->ports.src_and_dst; +} +/* Move slot src_i of src into slot dst_i of dst, zeroing each source field after it is copied. */ +static_always_inline void +ip4_tcp_udp_address_x4_copy_and_invalidate (ip4_tcp_udp_address_x4_t * dst, + ip4_tcp_udp_address_x4_t * src, + u32 dst_i, u32 src_i) +{ +#define _(d,s) d = s; s = 0; + _ (dst->src.as_ip4_address[dst_i].as_u32, src->src.as_ip4_address[src_i].as_u32); + _ (dst->dst.as_ip4_address[dst_i].as_u32, src->dst.as_ip4_address[src_i].as_u32); + _ (dst->ports.as_ports[dst_i].as_u32, src->ports.as_ports[src_i].as_u32); +#undef _ +} +/* Zero slot i of a; an all-zero slot is what ip4_tcp_udp_address_x4_is_valid reports as invalid. */ +static_always_inline void +ip4_tcp_udp_address_x4_invalidate (ip4_tcp_udp_address_x4_t * a, u32 i) +{ + a->src.as_ip4_address[i].as_u32 = 0; + a->dst.as_ip4_address[i].as_u32 = 0; + a->ports.as_ports[i].as_u32 = 0; +} +/* Nonzero unless the source, destination and ports of slot i are all zero. */ +static_always_inline uword +ip4_tcp_udp_address_x4_is_valid (ip4_tcp_udp_address_x4_t * a, u32 i) +{ + return !(a->src.as_ip4_address[i].as_u32 == 0 + && a->dst.as_ip4_address[i].as_u32 == 0 + && a->ports.as_ports[i].as_u32 == 0); +} + +#ifdef TCP_HAVE_VEC128 +static_always_inline uword +ip4_tcp_udp_address_x4_match_helper (ip4_tcp_udp_address_x4_t * 
ax4, + u32x4 src, u32x4 dst, u32x4 ports) +{ + u32x4 r; + u32 m; + + r = u32x4_is_equal (src, ax4->src.as_u32x4); + r &= u32x4_is_equal (dst, ax4->dst.as_u32x4); + r &= u32x4_is_equal (ports, ax4->ports.as_u32x4); + + /* At this point r will be either all zeros (if nothing matched) + or have 32 1s in the position that did match. */ + m = u8x16_compare_byte_mask ((u8x16) r); + + return m; +} + +static_always_inline uword +ip4_tcp_udp_address_x4_match (ip4_tcp_udp_address_x4_t * ax4, + ip4_header_t * ip, + tcp_header_t * tcp) +{ + u32x4 src = u32x4_splat (ip->src_address.as_u32); + u32x4 dst = u32x4_splat (ip->dst_address.as_u32); + u32x4 ports = u32x4_splat (tcp->ports.src_and_dst); + return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4, src, dst, ports)); +} + +static_always_inline uword +ip4_tcp_udp_address_x4_first_empty (ip4_tcp_udp_address_x4_t * ax4) +{ + u32x4 zero = {0}; + return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4, zero, zero, zero)); +} + +static_always_inline uword +ip4_tcp_udp_address_x4_empty_mask (ip4_tcp_udp_address_x4_t * ax4) +{ + u32x4 zero = {0}; + return my_zero_mask (ip4_tcp_udp_address_x4_match_helper (ax4, zero, zero, zero)); +} +#else /* TCP_HAVE_VEC128 */ +static_always_inline uword +ip4_tcp_udp_address_x4_match_helper (ip4_tcp_udp_address_x4_t * ax4, + u32 src, u32 dst, u32 ports) +{ + u32 r0, r1, r2, r3; + +#define _(i) \ + r##i = (src == ax4->src.as_ip4_address[i].as_u32 \ + && dst == ax4->dst.as_ip4_address[i].as_u32 \ + && ports == ax4->ports.as_ports[i].as_u32) + + _ (0); + _ (1); + _ (2); + _ (3); + +#undef _ + + return (((r0 ? 0xf : 0x0) << 0) + | ((r1 ? 0xf : 0x0) << 4) + | ((r2 ? 0xf : 0x0) << 8) + | ((r3 ? 
0xf : 0x0) << 12)); +} + +static_always_inline uword +ip4_tcp_udp_address_x4_match (ip4_tcp_udp_address_x4_t * ax4, + ip4_header_t * ip, + tcp_header_t * tcp) +{ + return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4, + ip->src_address.as_u32, + ip->dst_address.as_u32, + tcp->ports.src_and_dst)); +} + +static_always_inline uword +ip4_tcp_udp_address_x4_first_empty (ip4_tcp_udp_address_x4_t * ax4) +{ + return my_first_set (ip4_tcp_udp_address_x4_match_helper (ax4, 0, 0, 0)); +} + +static_always_inline uword +ip4_tcp_udp_address_x4_empty_mask (ip4_tcp_udp_address_x4_t * ax4) +{ + return my_zero_mask (ip4_tcp_udp_address_x4_match_helper (ax4, 0, 0, 0)); +} +#endif + +static u8 * format_ip4_tcp_udp_address_x4 (u8 * s, va_list * va) +{ + ip4_tcp_udp_address_x4_t * a = va_arg (*va, ip4_tcp_udp_address_x4_t *); + u32 ai = va_arg (*va, u32); + ASSERT (ai < 4); + + s = format (s, "%U:%d -> %U:%d", + format_ip4_address, &a->src.as_ip4_address[ai], + clib_net_to_host_u16 (a->ports.as_ports[ai].src), + format_ip4_address, &a->dst.as_ip4_address[ai], + clib_net_to_host_u16 (a->ports.as_ports[ai].dst)); + + return s; +} + +static_always_inline void +ip6_tcp_udp_address_x4_set_from_headers (ip6_tcp_udp_address_x4_t * a, + ip6_header_t * ip, + tcp_header_t * tcp, + u32 i) +{ + a->src.as_u32[0][i] = ip->src_address.as_u32[0]; + a->src.as_u32[1][i] = ip->src_address.as_u32[1]; + a->src.as_u32[2][i] = ip->src_address.as_u32[2]; + a->src.as_u32[3][i] = ip->src_address.as_u32[3]; + a->dst.as_u32[0][i] = ip->dst_address.as_u32[0]; + a->dst.as_u32[1][i] = ip->dst_address.as_u32[1]; + a->dst.as_u32[2][i] = ip->dst_address.as_u32[2]; + a->dst.as_u32[3][i] = ip->dst_address.as_u32[3]; + a->ports.as_ports[i].as_u32 = tcp->ports.src_and_dst; +} + +static_always_inline void +ip6_tcp_udp_address_x4_copy_and_invalidate (ip6_tcp_udp_address_x4_t * dst, + ip6_tcp_udp_address_x4_t * src, + u32 dst_i, u32 src_i) +{ +#define _(d,s) d = s; s = 0; + _ (dst->src.as_u32[0][dst_i], 
src->src.as_u32[0][src_i]); + _ (dst->src.as_u32[1][dst_i], src->src.as_u32[1][src_i]); + _ (dst->src.as_u32[2][dst_i], src->src.as_u32[2][src_i]); + _ (dst->src.as_u32[3][dst_i], src->src.as_u32[3][src_i]); + _ (dst->dst.as_u32[0][dst_i], src->dst.as_u32[0][src_i]); + _ (dst->dst.as_u32[1][dst_i], src->dst.as_u32[1][src_i]); + _ (dst->dst.as_u32[2][dst_i], src->dst.as_u32[2][src_i]); + _ (dst->dst.as_u32[3][dst_i], src->dst.as_u32[3][src_i]); + _ (dst->ports.as_ports[dst_i].as_u32, src->ports.as_ports[src_i].as_u32); +#undef _ +} + +static_always_inline void +ip6_tcp_udp_address_x4_invalidate (ip6_tcp_udp_address_x4_t * a, u32 i) +{ + a->src.as_u32[0][i] = 0; + a->src.as_u32[1][i] = 0; + a->src.as_u32[2][i] = 0; + a->src.as_u32[3][i] = 0; + a->dst.as_u32[0][i] = 0; + a->dst.as_u32[1][i] = 0; + a->dst.as_u32[2][i] = 0; + a->dst.as_u32[3][i] = 0; + a->ports.as_ports[i].as_u32 = 0; +} + +static_always_inline uword +ip6_tcp_udp_address_x4_is_valid (ip6_tcp_udp_address_x4_t * a, u32 i) +{ + return !(a->src.as_u32[0][i] == 0 + && a->src.as_u32[1][i] == 0 + && a->src.as_u32[2][i] == 0 + && a->src.as_u32[3][i] == 0 + && a->dst.as_u32[0][i] == 0 + && a->dst.as_u32[1][i] == 0 + && a->dst.as_u32[2][i] == 0 + && a->dst.as_u32[3][i] == 0 + && a->ports.as_ports[i].as_u32 == 0); +} + +#ifdef TCP_HAVE_VEC128 +static_always_inline uword +ip6_tcp_udp_address_x4_match_helper (ip6_tcp_udp_address_x4_t * ax4, + u32x4 src0, u32x4 src1, u32x4 src2, u32x4 src3, + u32x4 dst0, u32x4 dst1, u32x4 dst2, u32x4 dst3, + u32x4 ports) +{ + u32x4 r; + u32 m; + + r = u32x4_is_equal (src0, ax4->src.as_u32x4[0]); + r &= u32x4_is_equal (src1, ax4->src.as_u32x4[1]); + r &= u32x4_is_equal (src2, ax4->src.as_u32x4[2]); + r &= u32x4_is_equal (src3, ax4->src.as_u32x4[3]); + r &= u32x4_is_equal (dst0, ax4->dst.as_u32x4[0]); + r &= u32x4_is_equal (dst1, ax4->dst.as_u32x4[1]); + r &= u32x4_is_equal (dst2, ax4->dst.as_u32x4[2]); + r &= u32x4_is_equal (dst3, ax4->dst.as_u32x4[3]); + r &= u32x4_is_equal (ports, 
ax4->ports.as_u32x4); + + /* At this point r will be either all zeros (if nothing matched) + or have 32 1s in the position that did match. */ + m = u8x16_compare_byte_mask ((u8x16) r); + + return m; +} + +static_always_inline uword +ip6_tcp_udp_address_x4_match (ip6_tcp_udp_address_x4_t * ax4, + ip6_header_t * ip, + tcp_header_t * tcp) +{ + u32x4 src0 = u32x4_splat (ip->src_address.as_u32[0]); + u32x4 src1 = u32x4_splat (ip->src_address.as_u32[1]); + u32x4 src2 = u32x4_splat (ip->src_address.as_u32[2]); + u32x4 src3 = u32x4_splat (ip->src_address.as_u32[3]); + u32x4 dst0 = u32x4_splat (ip->dst_address.as_u32[0]); + u32x4 dst1 = u32x4_splat (ip->dst_address.as_u32[1]); + u32x4 dst2 = u32x4_splat (ip->dst_address.as_u32[2]); + u32x4 dst3 = u32x4_splat (ip->dst_address.as_u32[3]); + u32x4 ports = u32x4_splat (tcp->ports.src_and_dst); + return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4, + src0, src1, src2, src3, + dst0, dst1, dst2, dst3, + ports)); +} + +static_always_inline uword +ip6_tcp_udp_address_x4_first_empty (ip6_tcp_udp_address_x4_t * ax4) +{ + u32x4 zero = {0}; + return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4, + zero, zero, zero, zero, + zero, zero, zero, zero, + zero)); +} + +static_always_inline uword +ip6_tcp_udp_address_x4_empty_mask (ip6_tcp_udp_address_x4_t * ax4) +{ + u32x4 zero = {0}; + return my_zero_mask (ip6_tcp_udp_address_x4_match_helper (ax4, + zero, zero, zero, zero, + zero, zero, zero, zero, + zero)); +} +#else /* TCP_HAVE_VEC128 */ +static_always_inline uword +ip6_tcp_udp_address_x4_match_helper (ip6_tcp_udp_address_x4_t * ax4, + u32 src0, u32 src1, u32 src2, u32 src3, + u32 dst0, u32 dst1, u32 dst2, u32 dst3, + u32 ports) +{ + u32 r0, r1, r2, r3; + +#define _(i) \ + r##i = (src0 == ax4->src.as_u32[i][0] \ + && src1 == ax4->src.as_u32[i][1] \ + && src2 == ax4->src.as_u32[i][2] \ + && src3 == ax4->src.as_u32[i][3] \ + && dst0 == ax4->dst.as_u32[i][0] \ + && dst1 == ax4->dst.as_u32[i][1] \ + && dst2 == 
ax4->dst.as_u32[i][2] \ + && dst3 == ax4->dst.as_u32[i][3] \ + && ports == ax4->ports.as_ports[i].as_u32) + + _ (0); + _ (1); + _ (2); + _ (3); + +#undef _ + + return (((r0 ? 0xf : 0x0) << 0) + | ((r1 ? 0xf : 0x0) << 4) + | ((r2 ? 0xf : 0x0) << 8) + | ((r3 ? 0xf : 0x0) << 12)); +} + +static_always_inline uword +ip6_tcp_udp_address_x4_match (ip6_tcp_udp_address_x4_t * ax4, + ip6_header_t * ip, + tcp_header_t * tcp) +{ + u32 src0 = ip->src_address.as_u32[0]; + u32 src1 = ip->src_address.as_u32[1]; + u32 src2 = ip->src_address.as_u32[2]; + u32 src3 = ip->src_address.as_u32[3]; + u32 dst0 = ip->dst_address.as_u32[0]; + u32 dst1 = ip->dst_address.as_u32[1]; + u32 dst2 = ip->dst_address.as_u32[2]; + u32 dst3 = ip->dst_address.as_u32[3]; + u32 ports = tcp->ports.src_and_dst; + return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4, + src0, src1, src2, src3, + dst0, dst1, dst2, dst3, + ports)); +} + +static_always_inline uword +ip6_tcp_udp_address_x4_first_empty (ip6_tcp_udp_address_x4_t * ax4) +{ + return my_first_set (ip6_tcp_udp_address_x4_match_helper (ax4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0)); +} + +static_always_inline uword +ip6_tcp_udp_address_x4_empty_mask (ip6_tcp_udp_address_x4_t * ax4) +{ + return my_zero_mask (ip6_tcp_udp_address_x4_match_helper (ax4, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0)); +} +#endif /* ! 
TCP_HAVE_VEC128 */ + +static u8 * format_ip6_tcp_udp_address_x4 (u8 * s, va_list * va) +{ + ip6_tcp_udp_address_x4_t * a = va_arg (*va, ip6_tcp_udp_address_x4_t *); + u32 i, ai = va_arg (*va, u32); + ip6_address_t src, dst; + + ASSERT (ai < 4); + for (i = 0; i < 4; i++) + { + src.as_u32[i] = a->src.as_u32[i][ai]; + dst.as_u32[i] = a->dst.as_u32[i][ai]; + } + + s = format (s, "%U:%d -> %U:%d", + format_ip6_address, &src, + clib_net_to_host_u16 (a->ports.as_ports[ai].src), + format_ip6_address, &dst, + clib_net_to_host_u16 (a->ports.as_ports[ai].dst)); + + return s; +} + +static_always_inline u32 +find_oldest_timestamp_x4 (u32 * time_stamps, u32 now) +{ + u32 dt0, dt_min0, i_min0; + u32 dt1, dt_min1, i_min1; + + i_min0 = i_min1 = 0; + dt_min0 = now - time_stamps[0]; + dt_min1 = now - time_stamps[2]; + dt0 = now - time_stamps[1]; + dt1 = now - time_stamps[3]; + + i_min0 += dt0 > dt_min0; + i_min1 += dt1 > dt_min1; + + dt_min0 = i_min0 > 0 ? dt0 : dt_min0; + dt_min1 = i_min1 > 0 ? dt1 : dt_min1; + + return dt_min0 > dt_min1 ? 
i_min0 : (2 + i_min1); +} + +static_always_inline uword +tcp_round_trip_time_stats_is_valid (tcp_round_trip_time_stats_t * s) +{ return s->count > 0; } + +static_always_inline void +tcp_round_trip_time_stats_compute (tcp_round_trip_time_stats_t * s, f64 * r) +{ + f64 ave, rms; + ASSERT (s->count > 0); + ave = s->sum / s->count; + rms = sqrt (s->sum2 / s->count - ave*ave); + r[0] = ave; + r[1] = rms; +} + +typedef struct { + tcp_option_type_t type : 8; + u8 length; + u32 my_time_stamp, his_time_stamp; +} __attribute__ ((packed)) tcp_time_stamp_option_t; + +typedef struct { + tcp_header_t header; + + struct { + struct { + tcp_option_type_t type : 8; + u8 length; + u16 value; + } mss; + + struct { + tcp_option_type_t type : 8; + u8 length; + u8 value; + } __attribute__ ((packed)) window_scale; + + u8 nops[3]; + + tcp_time_stamp_option_t time_stamp; + } __attribute__ ((packed)) options; +} __attribute__ ((packed)) tcp_syn_packet_t; + +typedef struct { + tcp_header_t header; + + struct { + u8 nops[2]; + + tcp_time_stamp_option_t time_stamp; + } options; +} __attribute__ ((packed)) tcp_ack_packet_t; + +typedef struct { + ip4_header_t ip4; + tcp_syn_packet_t tcp; +} ip4_tcp_syn_packet_t; + +typedef struct { + ip4_header_t ip4; + tcp_ack_packet_t tcp; +} ip4_tcp_ack_packet_t; + +typedef struct { + ip6_header_t ip6; + tcp_syn_packet_t tcp; +} ip6_tcp_syn_packet_t; + +typedef struct { + ip6_header_t ip6; + tcp_ack_packet_t tcp; +} ip6_tcp_ack_packet_t; + +static_always_inline void +ip4_tcp_packet_init (ip4_header_t * ip, u32 n_bytes) +{ + ip->ip_version_and_header_length = 0x45; + + ip->tos = ip4_main.host_config.tos; + ip->ttl = ip4_main.host_config.ttl; + + /* No need to set fragment ID due to DF bit. 
*/ + ip->flags_and_fragment_offset = clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT); + + ip->protocol = IP_PROTOCOL_TCP; + + ip->length = clib_host_to_net_u16 (n_bytes); + + ip->checksum = ip4_header_checksum (ip); +} + +static_always_inline void +ip6_tcp_packet_init (ip6_header_t * ip, u32 n_bytes) +{ + ip->ip_version_traffic_class_and_flow_label = clib_host_to_net_u32 (0x6 << 28); + + ip->payload_length = clib_host_to_net_u16 (n_bytes - sizeof (ip[0])); + + ip->hop_limit = ip6_main.host_config.ttl; +} + +static_always_inline u32 +tcp_time_now (tcp_main_t * tm, tcp_timer_type_t t) +{ + ASSERT (t < ARRAY_LEN (tm->log2_clocks_per_tick)); + return clib_cpu_time_now () >> tm->log2_clocks_per_tick[t]; +} + +static void +tcp_time_init (vlib_main_t * vm, tcp_main_t * tm) +{ + int i; + f64 log2 = .69314718055994530941; + + for (i = 0; i < ARRAY_LEN (tm->log2_clocks_per_tick); i++) + { + static f64 t[] = { +#define _(f,r) r, + foreach_tcp_timer +#undef _ + }; + tm->log2_clocks_per_tick[i] = + flt_round_nearest (log (t[i] / vm->clib_time.seconds_per_clock) / log2); + tm->secs_per_tick[i] = vm->clib_time.seconds_per_clock * (1 << tm->log2_clocks_per_tick[i]); + } +} + +tcp_main_t tcp_main; + +typedef enum { + TCP_LOOKUP_NEXT_DROP, + TCP_LOOKUP_NEXT_PUNT, + TCP_LOOKUP_NEXT_LISTEN_SYN, + TCP_LOOKUP_NEXT_LISTEN_ACK, + TCP_LOOKUP_NEXT_CONNECT_SYN_ACK, + TCP_LOOKUP_NEXT_ESTABLISHED, + TCP_LOOKUP_N_NEXT, +} tcp_lookup_next_t; + +#define foreach_tcp_error \ + _ (NONE, "no error") \ + _ (LOOKUP_DROPS, "lookup drops") \ + _ (LISTEN_RESPONSES, "listen responses sent") \ + _ (CONNECTS_SENT, "connects sent") \ + _ (LISTENS_ESTABLISHED, "listens connected") \ + _ (UNEXPECTED_SEQ_NUMBER, "unexpected sequence number drops") \ + _ (UNEXPECTED_ACK_NUMBER, "unexpected acknowledgment number drops") \ + _ (CONNECTS_ESTABLISHED, "connects established") \ + _ (NO_LISTENER_FOR_PORT, "no listener for port") \ + _ (WRONG_LOCAL_ADDRESS_FOR_PORT, "wrong local address for port") \ + _ (ACKS_SENT, 
"acks sent for established connections") \ + _ (NO_DATA, "acks with no data") \ + _ (FINS_RECEIVED, "fins received") \ + _ (SEGMENT_AFTER_FIN, "segments dropped after fin received") \ + _ (CONNECTIONS_CLOSED, "connections closed") + +typedef enum { +#define _(sym,str) TCP_ERROR_##sym, + foreach_tcp_error +#undef _ + TCP_N_ERROR, +} tcp_error_t; + +#ifdef TCP_HAVE_VEC128 +static_always_inline u32x4 u32x4_splat_x2 (u32 x) +{ + u32x4 r = u32x4_set0 (x); + return u32x4_interleave_lo (r, r); +} + +static_always_inline u32x4 u32x4_set_x2 (u32 x, u32 y) +{ + u32x4 r0 = u32x4_set0 (x); + u32x4 r1 = u32x4_set0 (y); + return u32x4_interleave_lo (r0, r1); +} + +/* FIXME */ +#define u32x4_get(x,i) \ + __builtin_ia32_vec_ext_v4si ((i32x4) (x), (int) (i)) +#else /* TCP_HAVE_VEC128 */ +#endif /* TCP_HAVE_VEC128 */ + +/* Dispatching on tcp/udp listeners (by dst port) + and tcp/udp connections (by src/dst address/port). */ +static_always_inline uword +ip46_tcp_lookup (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + uword is_ip6) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46 = is_ip6 ? 
&tm->ip6 : &tm->ip4; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next, mini_now; + vlib_node_runtime_t * error_node = node; + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next = node->cached_next_index; + mini_now = tcp_time_now (tm, TCP_TIMER_mini_connection); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip60; + ip4_header_t * ip40; + tcp_header_t * tcp0; + u32 bi0, imin0, iest0, li0; + tcp_connection_state_t state0; + u8 error0, next0; + u8 min_match0, est_match0, is_min_match0, is_est_match0; + u8 min_oldest0, est_first_empty0; + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + +#ifdef TCP_HAVE_VEC128 + { + u32x4 a0, b0, c0; + + a0 = tm->connection_hash_seeds[is_ip6][0].as_u32x4; + b0 = tm->connection_hash_seeds[is_ip6][1].as_u32x4; + c0 = tm->connection_hash_seeds[is_ip6][2].as_u32x4; + + if (is_ip6) + { + ip60 = vlib_buffer_get_current (p0); + tcp0 = ip6_next_header (ip60); + + a0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[0]); + b0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[1]); + c0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[2]); + + hash_v3_mix_u32x (a0, b0, c0); + + a0 ^= u32x4_splat_x2 (ip60->src_address.as_u32[3]); + b0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[0]); + c0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[1]); + + hash_v3_mix_u32x (a0, b0, c0); + + a0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[2]); + b0 ^= u32x4_splat_x2 (ip60->dst_address.as_u32[3]); + c0 ^= u32x4_splat_x2 (tcp0->ports.src_and_dst); + } + else + { + ip40 = vlib_buffer_get_current (p0); + tcp0 = ip4_next_header (ip40); + + a0 ^= u32x4_splat_x2 (ip40->src_address.as_u32); + b0 ^= u32x4_splat_x2 (ip40->dst_address.as_u32); + c0 ^= u32x4_splat_x2 
(tcp0->ports.src_and_dst); + } + + hash_v3_finalize_u32x (a0, b0, c0); + + c0 &= tm->connection_hash_masks[is_ip6].as_u32x4; + + imin0 = u32x4_get0 (c0); + iest0 = u32x4_get (c0, 1); + } +#else + { + u32 a00, a01, b00, b01, c00, c01; + + a00 = tm->connection_hash_seeds[is_ip6][0].as_u32[0]; + a01 = tm->connection_hash_seeds[is_ip6][0].as_u32[1]; + b00 = tm->connection_hash_seeds[is_ip6][1].as_u32[0]; + b01 = tm->connection_hash_seeds[is_ip6][1].as_u32[1]; + c00 = tm->connection_hash_seeds[is_ip6][2].as_u32[0]; + c01 = tm->connection_hash_seeds[is_ip6][2].as_u32[1]; + + if (is_ip6) + { + ip60 = vlib_buffer_get_current (p0); + tcp0 = ip6_next_header (ip60); + + a00 ^= ip60->src_address.as_u32[0]; + a01 ^= ip60->src_address.as_u32[0]; + b00 ^= ip60->src_address.as_u32[1]; + b01 ^= ip60->src_address.as_u32[1]; + c00 ^= ip60->src_address.as_u32[2]; + c01 ^= ip60->src_address.as_u32[2]; + + hash_v3_mix32 (a00, b00, c00); + hash_v3_mix32 (a01, b01, c01); + + a00 ^= ip60->src_address.as_u32[3]; + a01 ^= ip60->src_address.as_u32[3]; + b00 ^= ip60->dst_address.as_u32[0]; + b01 ^= ip60->dst_address.as_u32[0]; + c00 ^= ip60->dst_address.as_u32[1]; + c01 ^= ip60->dst_address.as_u32[1]; + + hash_v3_mix32 (a00, b00, c00); + hash_v3_mix32 (a01, b01, c01); + + a00 ^= ip60->dst_address.as_u32[2]; + a01 ^= ip60->dst_address.as_u32[2]; + b00 ^= ip60->dst_address.as_u32[3]; + b01 ^= ip60->dst_address.as_u32[3]; + c00 ^= tcp0->ports.src_and_dst; + c01 ^= tcp0->ports.src_and_dst; + } + else + { + ip40 = vlib_buffer_get_current (p0); + tcp0 = ip4_next_header (ip40); + + a00 ^= ip40->src_address.as_u32; + a01 ^= ip40->src_address.as_u32; + b00 ^= ip40->dst_address.as_u32; + b01 ^= ip40->dst_address.as_u32; + c00 ^= tcp0->ports.src_and_dst; + c01 ^= tcp0->ports.src_and_dst; + } + + hash_v3_finalize32 (a00, b00, c00); + hash_v3_finalize32 (a01, b01, c01); + + c00 &= tm->connection_hash_masks[is_ip6].as_u32[0]; + c01 &= tm->connection_hash_masks[is_ip6].as_u32[1]; + + imin0 = c00; + iest0 = 
c01; + } +#endif + + if (is_ip6) + { + ip6_tcp_udp_address_x4_and_timestamps_t * mina0; + ip6_tcp_udp_address_x4_t * esta0; + + mina0 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin0); + esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest0); + + min_match0 = ip6_tcp_udp_address_x4_match (&mina0->address_x4, ip60, tcp0); + est_match0 = ip6_tcp_udp_address_x4_match (esta0, ip60, tcp0); + + min_oldest0 = find_oldest_timestamp_x4 (mina0->time_stamps, mini_now); + est_first_empty0 = ip6_tcp_udp_address_x4_first_empty (esta0); + + if (PREDICT_FALSE (! est_match0 && est_first_empty0 >= 4 && ! min_match0)) + { + /* Lookup in overflow hash. */ + ASSERT (0); + } + } + else + { + ip4_tcp_udp_address_x4_and_timestamps_t * mina0; + ip4_tcp_udp_address_x4_t * esta0; + + mina0 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin0); + esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest0); + + min_match0 = ip4_tcp_udp_address_x4_match (&mina0->address_x4, ip40, tcp0); + est_match0 = ip4_tcp_udp_address_x4_match (esta0, ip40, tcp0); + + min_oldest0 = find_oldest_timestamp_x4 (mina0->time_stamps, mini_now); + est_first_empty0 = ip4_tcp_udp_address_x4_first_empty (esta0); + + if (PREDICT_FALSE (! est_match0 && est_first_empty0 >= 4 && ! min_match0)) + { + /* Lookup in overflow hash. */ + ASSERT (0); + } + } + + is_min_match0 = min_match0 < 4; + is_est_match0 = est_match0 < 4; + + imin0 = 4 * imin0 + (is_min_match0 ? min_match0 : min_oldest0); + iest0 = 4 * iest0 + (is_est_match0 ? est_match0 : est_first_empty0); + + /* Should simultaneously not match both in mini and established connection tables. */ + ASSERT (! 
(is_min_match0 && is_est_match0)); + + { + tcp_mini_connection_t * min0; + tcp_connection_t * est0; + tcp_sequence_pair_t * seq_pair0; + u8 flags0; + + min0 = vec_elt_at_index (tm46->mini_connections, imin0); + est0 = vec_elt_at_index (tm46->established_connections, iest0); + + if (min_match0 < 4) + { + ASSERT (min0->state != TCP_CONNECTION_STATE_unused); + ASSERT (min0->state != TCP_CONNECTION_STATE_established); + } + + seq_pair0 = is_min_match0 ? &min0->sequence_numbers : &est0->sequence_numbers; + + state0 = is_min_match0 ? min0->state : TCP_CONNECTION_STATE_unused; + state0 = is_est_match0 ? TCP_CONNECTION_STATE_established : state0; + + vnet_buffer (p0)->ip.tcp.established_connection_index = iest0; + vnet_buffer (p0)->ip.tcp.mini_connection_index = imin0; + vnet_buffer (p0)->ip.tcp.listener_index = li0 = tm->listener_index_by_dst_port[tcp0->ports.dst]; + + flags0 = tcp0->flags & (TCP_FLAG_SYN | TCP_FLAG_ACK | TCP_FLAG_RST | TCP_FLAG_FIN); + + next0 = tm->disposition_by_state_and_flags[state0][flags0].next; + error0 = tm->disposition_by_state_and_flags[state0][flags0].error; + + next0 = li0 != 0 ? next0 : TCP_LOOKUP_NEXT_PUNT; + error0 = li0 != 0 ? 
error0 : TCP_ERROR_NO_LISTENER_FOR_PORT; + } + + p0->error = error_node->errors[error0]; + + if (PREDICT_FALSE (next0 != next)) + { + to_next -= 1; + n_left_to_next += 1; + + vlib_put_next_frame (vm, node, next, n_left_to_next); + + next = next0; + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + } + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + if (node->flags & VLIB_NODE_FLAG_TRACE) + /* FIXME */ ; + + return frame->n_vectors; +} + +static uword +ip4_tcp_lookup (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_lookup (vm, node, frame, /* is_ip6 */ 0); } + +static uword +ip6_tcp_lookup (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_lookup (vm, node, frame, /* is_ip6 */ 1); } + +static void +ip46_size_hash_tables (ip46_tcp_main_t * m) +{ + m->mini_connection_hash_mask = pow2_mask (m->log2_n_mini_connection_hash_elts); + vec_validate_aligned (m->mini_connections, + m->mini_connection_hash_mask, + CLIB_CACHE_LINE_BYTES); + + m->established_connection_hash_mask = pow2_mask (m->log2_n_established_connection_hash_elts); + vec_validate_aligned (m->established_connections, + m->established_connection_hash_mask, + CLIB_CACHE_LINE_BYTES); +} + +static void +ip46_tcp_lookup_init (vlib_main_t * vm, tcp_main_t * tm, int is_ip6) +{ + ip46_tcp_main_t * m = is_ip6 ? 
&tm->ip6 : &tm->ip4; + + m->is_ip6 = is_ip6; + + m->log2_n_mini_connection_hash_elts = 8; + m->log2_n_established_connection_hash_elts = 8; + ip46_size_hash_tables (m); + + if (is_ip6) + { + vec_validate_aligned (tm->ip6_mini_connection_address_hash, + m->mini_connection_hash_mask / 4, + CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (tm->ip6_established_connection_address_hash, + m->established_connection_hash_mask / 4, + CLIB_CACHE_LINE_BYTES); + } + else + { + vec_validate_aligned (tm->ip4_mini_connection_address_hash, + m->mini_connection_hash_mask / 4, + CLIB_CACHE_LINE_BYTES); + vec_validate_aligned (tm->ip4_established_connection_address_hash, + m->established_connection_hash_mask / 4, + CLIB_CACHE_LINE_BYTES); + } + tm->connection_hash_masks[is_ip6].as_u32[0] = m->mini_connection_hash_mask / 4; + tm->connection_hash_masks[is_ip6].as_u32[1] = m->established_connection_hash_mask / 4; +} + +static void +tcp_lookup_init (vlib_main_t * vm, tcp_main_t * tm) +{ + int is_ip6; + + /* Initialize hash seeds. */ + for (is_ip6 = 0; is_ip6 < 2; is_ip6++) + { + u32 * r = clib_random_buffer_get_data (&vm->random_buffer, 3 * 2 * sizeof (r[0])); + tm->connection_hash_seeds[is_ip6][0].as_u32[0] = r[0]; + tm->connection_hash_seeds[is_ip6][0].as_u32[1] = r[1]; + tm->connection_hash_seeds[is_ip6][1].as_u32[0] = r[2]; + tm->connection_hash_seeds[is_ip6][1].as_u32[1] = r[3]; + tm->connection_hash_seeds[is_ip6][2].as_u32[0] = r[4]; + tm->connection_hash_seeds[is_ip6][2].as_u32[1] = r[5]; + + ip46_tcp_lookup_init (vm, tm, is_ip6); + } + + { + tcp_listener_t * l; + + pool_get_aligned (tm->listener_pool, l, CLIB_CACHE_LINE_BYTES); + + /* Null listener must always have zero index. */ + ASSERT (l - tm->listener_pool == 0); + + memset (l, 0, sizeof (l[0])); + + /* No adjacencies are valid. */ + l->valid_local_adjacency_bitmap = 0; + + vec_validate_init_empty (tm->listener_index_by_dst_port, + (1 << 16) - 1, + l - tm->listener_pool); + } + + /* Initialize disposition table. 
*/ + { + int i, j; + for (i = 0; i < ARRAY_LEN (tm->disposition_by_state_and_flags); i++) + for (j = 0; j < ARRAY_LEN (tm->disposition_by_state_and_flags[i]); j++) + { + tm->disposition_by_state_and_flags[i][j].next = TCP_LOOKUP_NEXT_DROP; + tm->disposition_by_state_and_flags[i][j].error = TCP_ERROR_LOOKUP_DROPS; + } + +#define _(t,f,n,e) \ +do { \ + tm->disposition_by_state_and_flags[TCP_CONNECTION_STATE_##t][f].next = (n); \ + tm->disposition_by_state_and_flags[TCP_CONNECTION_STATE_##t][f].error = (e); \ +} while (0) + + /* SYNs for new connections -> tcp-listen. */ + _ (unused, TCP_FLAG_SYN, + TCP_LOOKUP_NEXT_LISTEN_SYN, TCP_ERROR_NONE); + _ (listen_ack_wait, TCP_FLAG_ACK, + TCP_LOOKUP_NEXT_LISTEN_ACK, TCP_ERROR_NONE); + _ (established, TCP_FLAG_ACK, + TCP_LOOKUP_NEXT_ESTABLISHED, TCP_ERROR_NONE); + _ (established, TCP_FLAG_FIN | TCP_FLAG_ACK, + TCP_LOOKUP_NEXT_ESTABLISHED, TCP_ERROR_NONE); + +#undef _ + } + + /* IP4 packet templates. */ + { + ip4_tcp_syn_packet_t ip4_syn, ip4_syn_ack; + ip4_tcp_ack_packet_t ip4_ack, ip4_fin_ack, ip4_rst_ack; + ip6_tcp_syn_packet_t ip6_syn, ip6_syn_ack; + ip6_tcp_ack_packet_t ip6_ack, ip6_fin_ack, ip6_rst_ack; + + memset (&ip4_syn, 0, sizeof (ip4_syn)); + memset (&ip4_syn_ack, 0, sizeof (ip4_syn_ack)); + memset (&ip4_ack, 0, sizeof (ip4_ack)); + memset (&ip4_fin_ack, 0, sizeof (ip4_fin_ack)); + memset (&ip4_rst_ack, 0, sizeof (ip4_rst_ack)); + memset (&ip6_syn, 0, sizeof (ip6_syn)); + memset (&ip6_syn_ack, 0, sizeof (ip6_syn_ack)); + memset (&ip6_ack, 0, sizeof (ip6_ack)); + memset (&ip6_fin_ack, 0, sizeof (ip6_fin_ack)); + memset (&ip6_rst_ack, 0, sizeof (ip6_rst_ack)); + + ip4_tcp_packet_init (&ip4_syn.ip4, sizeof (ip4_syn)); + ip4_tcp_packet_init (&ip4_syn_ack.ip4, sizeof (ip4_syn_ack)); + ip4_tcp_packet_init (&ip4_ack.ip4, sizeof (ip4_ack)); + ip4_tcp_packet_init (&ip4_fin_ack.ip4, sizeof (ip4_fin_ack)); + ip4_tcp_packet_init (&ip4_rst_ack.ip4, sizeof (ip4_rst_ack)); + + ip6_tcp_packet_init (&ip6_syn.ip6, sizeof (ip6_syn)); 
+ ip6_tcp_packet_init (&ip6_syn_ack.ip6, sizeof (ip6_syn_ack)); + ip6_tcp_packet_init (&ip6_ack.ip6, sizeof (ip6_ack)); + ip6_tcp_packet_init (&ip6_fin_ack.ip6, sizeof (ip6_fin_ack)); + ip6_tcp_packet_init (&ip6_rst_ack.ip6, sizeof (ip6_rst_ack)); + + /* TCP header. */ + { + u8 window_scale = 7; + tcp_syn_packet_t * s = &ip4_syn.tcp; + tcp_syn_packet_t * sa = &ip4_syn_ack.tcp; + tcp_ack_packet_t * a = &ip4_ack.tcp; + tcp_ack_packet_t * fa = &ip4_fin_ack.tcp; + tcp_ack_packet_t * ra = &ip4_rst_ack.tcp; + + s->header.tcp_header_u32s_and_reserved = (sizeof (s[0]) / sizeof (u32)) << 4; + a->header.tcp_header_u32s_and_reserved = (sizeof (a[0]) / sizeof (u32)) << 4; + + s->header.flags = TCP_FLAG_SYN; + a->header.flags = TCP_FLAG_ACK; + + s->header.window = clib_host_to_net_u16 (32 << (10 - window_scale)); + a->header.window = s->header.window; + + s->options.mss.type = TCP_OPTION_MSS; + s->options.mss.length = 4; + + s->options.window_scale.type = TCP_OPTION_WINDOW_SCALE; + s->options.window_scale.length = 3; + s->options.window_scale.value = window_scale; + + s->options.time_stamp.type = TCP_OPTION_TIME_STAMP; + s->options.time_stamp.length = 10; + + memset (&s->options.nops, TCP_OPTION_NOP, sizeof (s->options.nops)); + + /* SYN-ACK is same as SYN but with ACK flag set. */ + sa[0] = s[0]; + sa->header.flags |= TCP_FLAG_ACK; + + a->options.time_stamp.type = TCP_OPTION_TIME_STAMP; + a->options.time_stamp.length = 10; + memset (&a->options.nops, TCP_OPTION_NOP, sizeof (a->options.nops)); + + /* {FIN,RST}-ACK are same as ACK but with {FIN,RST} flag set. */ + fa[0] = a[0]; + fa->header.flags |= TCP_FLAG_FIN; + ra[0] = a[0]; + ra->header.flags |= TCP_FLAG_RST; + + /* IP6 TCP headers are identical. */ + ip6_syn.tcp = s[0]; + ip6_syn_ack.tcp = sa[0]; + ip6_ack.tcp = a[0]; + ip6_fin_ack.tcp = fa[0]; + ip6_rst_ack.tcp = ra[0]; + + /* TCP checksums. 
*/ + { + ip_csum_t sum; + + sum = clib_host_to_net_u32 (sizeof (ip4_ack.tcp) + (ip4_ack.ip4.protocol << 16)); + sum = ip_incremental_checksum (sum, &ip4_ack.tcp, sizeof (ip4_ack.tcp)); + ip4_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip4_fin_ack.tcp) + (ip4_fin_ack.ip4.protocol << 16)); + sum = ip_incremental_checksum (sum, &ip4_fin_ack.tcp, sizeof (ip4_fin_ack.tcp)); + ip4_fin_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip4_rst_ack.tcp) + (ip4_rst_ack.ip4.protocol << 16)); + sum = ip_incremental_checksum (sum, &ip4_rst_ack.tcp, sizeof (ip4_rst_ack.tcp)); + ip4_rst_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip4_syn.tcp) + (ip4_syn.ip4.protocol << 16)); + sum = ip_incremental_checksum (sum, &ip4_syn.tcp, sizeof (ip4_syn.tcp)); + ip4_syn.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip4_syn_ack.tcp) + (ip4_syn_ack.ip4.protocol << 16)); + sum = ip_incremental_checksum (sum, &ip4_syn_ack.tcp, sizeof (ip4_syn_ack.tcp)); + ip4_syn_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip6_ack.tcp)) + ip6_ack.ip6.protocol; + sum = ip_incremental_checksum (sum, &ip6_ack.tcp, sizeof (ip6_ack.tcp)); + ip6_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip6_fin_ack.tcp)) + ip6_fin_ack.ip6.protocol; + sum = ip_incremental_checksum (sum, &ip6_fin_ack.tcp, sizeof (ip6_fin_ack.tcp)); + ip6_fin_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip6_rst_ack.tcp)) + ip6_rst_ack.ip6.protocol; + sum = ip_incremental_checksum (sum, &ip6_rst_ack.tcp, sizeof (ip6_rst_ack.tcp)); + ip6_rst_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip6_syn.tcp)) + ip6_syn.ip6.protocol; + sum = ip_incremental_checksum (sum, &ip6_syn.tcp, sizeof (ip6_syn.tcp)); + 
ip6_syn.tcp.header.checksum = ~ ip_csum_fold (sum); + + sum = clib_host_to_net_u32 (sizeof (ip6_syn_ack.tcp)) + ip6_syn_ack.ip6.protocol; + sum = ip_incremental_checksum (sum, &ip6_syn_ack.tcp, sizeof (ip6_syn_ack.tcp)); + ip6_syn_ack.tcp.header.checksum = ~ ip_csum_fold (sum); + } + } + +#define _(t,x,n) \ +do { \ + vlib_packet_template_init \ + (vm, \ + &tm->ip4.packet_templates[t].vlib, \ + &x, sizeof (x), \ + /* alloc chunk size */ VLIB_FRAME_SIZE, \ + (n)); \ + tm->ip4.packet_templates[t].tcp_checksum_net_byte_order \ + = x.tcp.header.checksum; \ + tm->ip4.packet_templates[t].ip4_checksum_net_byte_order \ + = x.ip4.checksum; \ +} while (0) + + _ (TCP_PACKET_TEMPLATE_SYN, ip4_syn, "ip4 tcp syn"); + _ (TCP_PACKET_TEMPLATE_SYN_ACK, ip4_syn_ack, "ip4 tcp syn-ack"); + _ (TCP_PACKET_TEMPLATE_ACK, ip4_ack, "ip4 tcp ack"); + _ (TCP_PACKET_TEMPLATE_FIN_ACK, ip4_fin_ack, "ip4 tcp fin-ack"); + _ (TCP_PACKET_TEMPLATE_RST_ACK, ip4_rst_ack, "ip4 tcp rst-ack"); + +#undef _ + +#define _(t,x,n) \ +do { \ + vlib_packet_template_init \ + (vm, \ + &tm->ip6.packet_templates[t].vlib, \ + &x, sizeof (x), \ + /* alloc chunk size */ VLIB_FRAME_SIZE, \ + (n)); \ + tm->ip6.packet_templates[t].tcp_checksum_net_byte_order \ + = x.tcp.header.checksum; \ + tm->ip6.packet_templates[t].ip4_checksum_net_byte_order \ + = 0xdead; \ +} while (0) + + _ (TCP_PACKET_TEMPLATE_SYN, ip6_syn, "ip6 tcp syn"); + _ (TCP_PACKET_TEMPLATE_SYN_ACK, ip6_syn_ack, "ip6 tcp syn-ack"); + _ (TCP_PACKET_TEMPLATE_ACK, ip6_ack, "ip6 tcp ack"); + _ (TCP_PACKET_TEMPLATE_FIN_ACK, ip6_fin_ack, "ip6 tcp fin-ack"); + _ (TCP_PACKET_TEMPLATE_RST_ACK, ip6_rst_ack, "ip6 tcp rst-ack"); + +#undef _ + } +} + +static char * tcp_error_strings[] = { +#define _(sym,string) string, + foreach_tcp_error +#undef _ +}; + +VLIB_REGISTER_NODE (ip4_tcp_lookup_node,static) = { + .function = ip4_tcp_lookup, + .name = "ip4-tcp-lookup", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_LOOKUP_N_NEXT, + .next_nodes = { + 
[TCP_LOOKUP_NEXT_DROP] = "error-drop", + [TCP_LOOKUP_NEXT_PUNT] = "error-punt", + [TCP_LOOKUP_NEXT_LISTEN_SYN] = "ip4-tcp-listen", + [TCP_LOOKUP_NEXT_LISTEN_ACK] = "ip4-tcp-establish", + [TCP_LOOKUP_NEXT_CONNECT_SYN_ACK] = "ip4-tcp-connect", + [TCP_LOOKUP_NEXT_ESTABLISHED] = "ip4-tcp-established", + }, + + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, +}; + +VLIB_REGISTER_NODE (ip6_tcp_lookup_node,static) = { + .function = ip6_tcp_lookup, + .name = "ip6-tcp-lookup", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_LOOKUP_N_NEXT, + .next_nodes = { + [TCP_LOOKUP_NEXT_DROP] = "error-drop", + [TCP_LOOKUP_NEXT_PUNT] = "error-punt", + [TCP_LOOKUP_NEXT_LISTEN_SYN] = "ip6-tcp-listen", + [TCP_LOOKUP_NEXT_LISTEN_ACK] = "ip4-tcp-establish", + [TCP_LOOKUP_NEXT_CONNECT_SYN_ACK] = "ip6-tcp-connect", + [TCP_LOOKUP_NEXT_ESTABLISHED] = "ip6-tcp-established", + }, + + .n_errors = TCP_N_ERROR, + .error_strings = tcp_error_strings, +}; + +static_always_inline void +tcp_options_decode_for_syn (tcp_main_t * tm, tcp_mini_connection_t * m, tcp_header_t * tcp) +{ + u8 * o = (void *) (tcp + 1); + u32 n_bytes = (tcp->tcp_header_u32s_and_reserved >> 4) * sizeof (u32); + u8 * e = o + n_bytes; + tcp_mini_connection_t * tmpl = &tm->option_decode_mini_connection_template; + tcp_option_type_t t; + u8 i, l, * p; + u8 * option_decode[16]; + + /* Initialize defaults. */ + option_decode[TCP_OPTION_MSS] = (u8 *) &tmpl->max_segment_size; + option_decode[TCP_OPTION_WINDOW_SCALE] = (u8 *) &tmpl->window_scale; + option_decode[TCP_OPTION_TIME_STAMP] = (u8 *) &tmpl->time_stamps.his_net_byte_order; + + if (n_bytes > 0) + { +#define _ \ +do { \ + t = o[0]; \ + i = t >= ARRAY_LEN (option_decode) ? TCP_OPTION_END : t; \ + option_decode[i] = o + 2; \ + /* Skip nop; don't skip end; else length from packet. */ \ + l = t < 2 ? t : o[1]; \ + p = o + l; \ + o = p < e ? p : o; \ +} while (0) + + _; _; _; + /* Fast path: NOP NOP TIMESTAMP. 
*/ + if (o >= e) goto done; + _; _; + if (o >= e) goto done; + _; _; _; + +#undef _ + + done:; + } + + m->max_segment_size = + clib_net_to_host_u16 (*(u16 *) option_decode[TCP_OPTION_MSS]); + m->window_scale = *option_decode[TCP_OPTION_WINDOW_SCALE]; + m->time_stamps.his_net_byte_order = ((u32 *) option_decode[TCP_OPTION_TIME_STAMP])[0]; +} + +static_always_inline u32 +tcp_options_decode_for_ack (tcp_main_t * tm, tcp_header_t * tcp, + u32 * his_time_stamp) +{ + u8 * o = (void *) (tcp + 1); + u32 n_bytes = (tcp->tcp_header_u32s_and_reserved >> 4) * sizeof (u32); + u8 * e = o + n_bytes; + tcp_option_type_t t; + u8 i, l, * p; + u8 * option_decode[16]; + u32 default_time_stamps[2]; + + /* Initialize defaults. */ + default_time_stamps[0] = default_time_stamps[1] = 0; + option_decode[TCP_OPTION_TIME_STAMP] = (u8 *) &default_time_stamps; + + if (n_bytes > 0) + { +#define _ \ +do { \ + t = o[0]; \ + i = t >= ARRAY_LEN (option_decode) ? TCP_OPTION_END : t; \ + option_decode[i] = o + 2; \ + /* Skip nop; don't skip end; else length from packet. */ \ + l = t < 2 ? t : o[1]; \ + p = o + l; \ + o = p < e ? p : o; \ +} while (0) + + _; _; _; + /* Fast path: NOP NOP TIMESTAMP. */ + if (o >= e) goto done; + _; _; + if (o >= e) goto done; + _; _; _; +#undef _ + + done:; + } + + if (his_time_stamp) + his_time_stamp[0] = ((u32 *) option_decode[TCP_OPTION_TIME_STAMP])[0]; + + return clib_net_to_host_u32 (((u32 *) option_decode[TCP_OPTION_TIME_STAMP])[1]); +} + +static void +tcp_options_decode_init (tcp_main_t * tm) +{ + tcp_mini_connection_t * m = &tm->option_decode_mini_connection_template; + + memset (m, 0, sizeof (m[0])); + m->max_segment_size = clib_host_to_net_u16 (576 - 40); + m->window_scale = 0; + m->time_stamps.his_net_byte_order = 0; +} + +/* Initialize target buffer as "related" to given buffer. 
*/ +always_inline void +vlib_buffer_copy_shared_fields (vlib_main_t * vm, vlib_buffer_t * b, u32 bi_target) +{ + vlib_buffer_t * b_target = vlib_get_buffer (vm, bi_target); + vnet_buffer (b_target)->sw_if_index[VLIB_RX] = vnet_buffer (b)->sw_if_index[VLIB_RX]; + b_target->trace_index = b->trace_index; + b_target->flags |= b->flags & VLIB_BUFFER_IS_TRACED; +} + +typedef enum { + TCP_LISTEN_NEXT_DROP, + TCP_LISTEN_NEXT_REPLY, + TCP_LISTEN_N_NEXT, +} tcp_listen_next_t; + +static_always_inline uword +ip46_tcp_listen (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + uword is_ip6) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + uword n_packets = frame->n_vectors; + u32 * from, * to_reply, * to_drop, * random_ack_numbers; + u32 n_left_from, n_left_to_reply, n_left_to_drop, mini_now, timestamp_now; + u16 * fid, * fragment_ids; + vlib_node_runtime_t * error_node; + + error_node = vlib_node_get_runtime + (vm, is_ip6 ? ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + mini_now = tcp_time_now (tm, TCP_TIMER_mini_connection); + timestamp_now = tcp_time_now (tm, TCP_TIMER_timestamp); + + random_ack_numbers = clib_random_buffer_get_data (&vm->random_buffer, + n_packets * sizeof (random_ack_numbers[0])); + /* Get random fragment IDs for replies. 
*/ + fid = fragment_ids = clib_random_buffer_get_data (&vm->random_buffer, + n_packets * sizeof (fragment_ids[0])); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, TCP_LISTEN_NEXT_REPLY, + to_reply, n_left_to_reply); + vlib_get_next_frame (vm, node, TCP_LISTEN_NEXT_DROP, + to_drop, n_left_to_drop); + + while (n_left_from > 0 && n_left_to_reply > 0 && n_left_to_drop > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip60; + ip4_header_t * ip40; + tcp_header_t * tcp0; + tcp_mini_connection_t * min0; + tcp_syn_packet_t * tcp_reply0; + ip_csum_t tcp_sum0; + u32 bi0, bi_reply0, imin0, my_seq_net0, his_seq_host0, his_seq_net0; + u8 i0; + + bi0 = to_drop[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_drop += 1; + n_left_to_drop -= 1; + + p0 = vlib_get_buffer (vm, bi0); + + p0->error = error_node->errors[TCP_ERROR_LISTEN_RESPONSES]; + + imin0 = vnet_buffer (p0)->ip.tcp.mini_connection_index; + i0 = imin0 % 4; + + if (is_ip6) + { + ip6_tcp_udp_address_x4_and_timestamps_t * mina0; + + ip60 = vlib_buffer_get_current (p0); + tcp0 = ip6_next_header (ip60); + + mina0 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin0 / 4); + + ip6_tcp_udp_address_x4_set_from_headers (&mina0->address_x4, + ip60, tcp0, i0); + mina0->time_stamps[i0] = mini_now; + } + else + { + ip4_tcp_udp_address_x4_and_timestamps_t * mina0; + + ip40 = vlib_buffer_get_current (p0); + tcp0 = ip4_next_header (ip40); + + mina0 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin0 / 4); + + ip4_tcp_udp_address_x4_set_from_headers (&mina0->address_x4, + ip40, tcp0, i0); + mina0->time_stamps[i0] = mini_now; + } + + min0 = vec_elt_at_index (tm46->mini_connections, imin0); + + min0->state = TCP_CONNECTION_STATE_listen_ack_wait; + min0->time_stamps.ours_host_byte_order = timestamp_now; + tcp_options_decode_for_syn (tm, min0, tcp0); + + my_seq_net0 = *random_ack_numbers++; + his_seq_host0 = 1 + clib_net_to_host_u32 (tcp0->seq_number); + + min0->sequence_numbers.ours = 1 + 
clib_net_to_host_u32 (my_seq_net0); + min0->sequence_numbers.his = his_seq_host0; + + if (is_ip6) + { + ip6_tcp_syn_packet_t * r0; + uword tmp0, i; + + r0 = vlib_packet_template_get_packet + (vm, + &tm->ip6.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK].vlib, + &bi_reply0); + tcp_reply0 = &r0->tcp; + + tcp_sum0 = (tm->ip6.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK] + .tcp_checksum_net_byte_order); + + for (i = 0; i < ARRAY_LEN (ip60->dst_address.as_uword); i++) + { + tmp0 = r0->ip6.src_address.as_uword[i] = ip60->dst_address.as_uword[i]; + tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0); + + tmp0 = r0->ip6.dst_address.as_uword[i] = ip60->src_address.as_uword[i]; + tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0); + } + } + else + { + ip4_tcp_syn_packet_t * r0; + ip_csum_t ip_sum0; + u32 src0, dst0; + + r0 = vlib_packet_template_get_packet + (vm, + &tm->ip4.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK].vlib, + &bi_reply0); + tcp_reply0 = &r0->tcp; + + tcp_sum0 = (tm->ip4.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK] + .tcp_checksum_net_byte_order); + ip_sum0 = (tm->ip4.packet_templates[TCP_PACKET_TEMPLATE_SYN_ACK] + .ip4_checksum_net_byte_order); + + src0 = r0->ip4.src_address.as_u32 = ip40->dst_address.as_u32; + dst0 = r0->ip4.dst_address.as_u32 = ip40->src_address.as_u32; + + ip_sum0 = ip_csum_add_even (ip_sum0, src0); + tcp_sum0 = ip_csum_add_even (tcp_sum0, src0); + + ip_sum0 = ip_csum_add_even (ip_sum0, dst0); + tcp_sum0 = ip_csum_add_even (tcp_sum0, dst0); + + r0->ip4.checksum = ip_csum_fold (ip_sum0); + + ASSERT (r0->ip4.checksum == ip4_header_checksum (&r0->ip4)); + } + + tcp_reply0->header.ports.src = tcp0->ports.dst; + tcp_reply0->header.ports.dst = tcp0->ports.src; + tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->header.ports.src_and_dst); + + tcp_reply0->header.seq_number = my_seq_net0; + tcp_sum0 = ip_csum_add_even (tcp_sum0, my_seq_net0); + + his_seq_net0 = clib_host_to_net_u32 (his_seq_host0); + tcp_reply0->header.ack_number = his_seq_net0; + tcp_sum0 = 
ip_csum_add_even (tcp_sum0, his_seq_net0); + + { + ip_adjacency_t * adj0 = ip_get_adjacency (&ip4_main.lookup_main, vnet_buffer (p0)->ip.adj_index[VLIB_RX]); + u16 my_mss = + (adj0->rewrite_header.max_l3_packet_bytes + - (is_ip6 ? sizeof (ip60[0]) : sizeof (ip40[0])) + - sizeof (tcp0[0])); + + my_mss = clib_min (my_mss, min0->max_segment_size); + min0->max_segment_size = my_mss; + + tcp_reply0->options.mss.value = clib_host_to_net_u16 (my_mss); + tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->options.mss.value); + } + + tcp_reply0->options.time_stamp.my_time_stamp = clib_host_to_net_u32 (timestamp_now); + tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->options.time_stamp.my_time_stamp); + + tcp_reply0->options.time_stamp.his_time_stamp = min0->time_stamps.his_net_byte_order; + tcp_sum0 = ip_csum_add_even (tcp_sum0, tcp_reply0->options.time_stamp.his_time_stamp); + + tcp_reply0->header.checksum = ip_csum_fold (tcp_sum0); + + vlib_buffer_copy_shared_fields (vm, p0, bi_reply0); + + to_reply[0] = bi_reply0; + n_left_to_reply -= 1; + to_reply += 1; + } + + vlib_put_next_frame (vm, node, TCP_LISTEN_NEXT_REPLY, n_left_to_reply); + vlib_put_next_frame (vm, node, TCP_LISTEN_NEXT_DROP, n_left_to_drop); + } + + if (node->flags & VLIB_NODE_FLAG_TRACE) + /* FIXME */ ; + + return frame->n_vectors; +} + +static uword +ip4_tcp_listen (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_listen (vm, node, frame, /* is_ip6 */ 0); } + +static uword +ip6_tcp_listen (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_listen (vm, node, frame, /* is_ip6 */ 1); } + +VLIB_REGISTER_NODE (ip4_tcp_listen_node,static) = { + .function = ip4_tcp_listen, + .name = "ip4-tcp-listen", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = { + [TCP_LISTEN_NEXT_DROP] = "error-drop", + [TCP_LISTEN_NEXT_REPLY] = CLIB_DEBUG > 0 ? 
"ip4-input" : "ip4-lookup", + }, +}; + +VLIB_REGISTER_NODE (ip6_tcp_listen_node,static) = { + .function = ip6_tcp_listen, + .name = "ip6-tcp-listen", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_LISTEN_N_NEXT, + .next_nodes = { + [TCP_LISTEN_NEXT_DROP] = "error-drop", + [TCP_LISTEN_NEXT_REPLY] = CLIB_DEBUG > 0 ? "ip6-input" : "ip6-lookup", + }, +}; + +typedef enum { + TCP_CONNECT_NEXT_DROP, + TCP_CONNECT_NEXT_REPLY, + TCP_CONNECT_N_NEXT, +} tcp_connect_next_t; + +static_always_inline uword +ip46_tcp_connect (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + uword is_ip6) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next; + vlib_node_runtime_t * error_node; + + /* FIXME */ + clib_warning ("%p", tm46); + + error_node = vlib_node_get_runtime + (vm, is_ip6 ? ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next = node->cached_next_index; + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip60; + ip4_header_t * ip40; + tcp_header_t * tcp0; + u32 bi0; + u8 error0, next0; + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + + if (is_ip6) + { + ip60 = vlib_buffer_get_current (p0); + tcp0 = ip6_next_header (ip60); + } + else + { + ip40 = vlib_buffer_get_current (p0); + tcp0 = ip4_next_header (ip40); + } + + ASSERT (0); + + error0 = next0 = 0; + p0->error = error_node->errors[error0]; + + if (PREDICT_FALSE (next0 != next)) + { + to_next -= 1; + n_left_to_next += 1; + + vlib_put_next_frame (vm, node, next, n_left_to_next); + + next = next0; + vlib_get_next_frame (vm, node, next, to_next, 
n_left_to_next); + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + } + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + if (node->flags & VLIB_NODE_FLAG_TRACE) + /* FIXME */ ; + + return frame->n_vectors; +} + +static uword +ip4_tcp_connect (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_connect (vm, node, frame, /* is_ip6 */ 0); } + +static uword +ip6_tcp_connect (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_connect (vm, node, frame, /* is_ip6 */ 1); } + +VLIB_REGISTER_NODE (ip4_tcp_connect_node,static) = { + .function = ip4_tcp_connect, + .name = "ip4-tcp-connect", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_CONNECT_N_NEXT, + .next_nodes = { + [TCP_CONNECT_NEXT_DROP] = "error-drop", + [TCP_CONNECT_NEXT_REPLY] = CLIB_DEBUG > 0 ? "ip4-input" : "ip4-lookup", + }, +}; + +VLIB_REGISTER_NODE (ip6_tcp_connect_node,static) = { + .function = ip6_tcp_connect, + .name = "ip6-tcp-connect", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_CONNECT_N_NEXT, + .next_nodes = { + [TCP_CONNECT_NEXT_DROP] = "error-drop", + [TCP_CONNECT_NEXT_REPLY] = CLIB_DEBUG > 0 ? "ip6-input" : "ip6-lookup", + }, +}; + +typedef enum { + TCP_ESTABLISH_NEXT_DROP, + TCP_ESTABLISH_NEXT_ESTABLISHED, + TCP_ESTABLISH_N_NEXT, +} tcp_establish_next_t; + +static_always_inline uword +ip46_tcp_establish (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + uword is_ip6) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next, mini_long_long_ago, timestamp_now; + vlib_node_runtime_t * error_node; + + error_node = vlib_node_get_runtime + (vm, is_ip6 ? 
ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next = node->cached_next_index; + mini_long_long_ago = + (tcp_time_now (tm, TCP_TIMER_mini_connection) + + (1 << (BITS (mini_long_long_ago) - 1))); + timestamp_now = tcp_time_now (tm, TCP_TIMER_timestamp); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip60; + ip4_header_t * ip40; + tcp_header_t * tcp0; + tcp_mini_connection_t * min0; + tcp_connection_t * est0; + tcp_listener_t * l0; + u32 bi0, imin0, iest0; + u8 error0, next0, i0, e0; + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + + imin0 = vnet_buffer (p0)->ip.tcp.mini_connection_index; + iest0 = vnet_buffer (p0)->ip.tcp.established_connection_index; + + i0 = imin0 % 4; + e0 = iest0 % 4; + + min0 = vec_elt_at_index (tm46->mini_connections, imin0); + if (PREDICT_FALSE (min0->state == TCP_CONNECTION_STATE_unused)) + goto already_established0; + min0->state = TCP_CONNECTION_STATE_unused; + + if (is_ip6) + { + ip60 = vlib_buffer_get_current (p0); + tcp0 = ip6_next_header (ip60); + } + else + { + ip40 = vlib_buffer_get_current (p0); + tcp0 = ip4_next_header (ip40); + } + + if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->seq_number) + != min0->sequence_numbers.his)) + goto unexpected_seq_number0; + if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->ack_number) + != min0->sequence_numbers.ours)) + goto unexpected_ack_number0; + + if (is_ip6) + { + ip6_tcp_udp_address_x4_and_timestamps_t * mina0; + ip6_tcp_udp_address_x4_t * esta0; + + mina0 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin0 / 4); + esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest0 / 4); + + ip6_tcp_udp_address_x4_copy_and_invalidate (esta0, 
&mina0->address_x4, e0, i0); + + mina0->time_stamps[i0] = mini_long_long_ago; + } + else + { + ip4_tcp_udp_address_x4_and_timestamps_t * mina0; + ip4_tcp_udp_address_x4_t * esta0; + + mina0 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin0 / 4); + esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest0 / 4); + + ip4_tcp_udp_address_x4_copy_and_invalidate (esta0, &mina0->address_x4, e0, i0); + + mina0->time_stamps[i0] = mini_long_long_ago; + } + + est0 = vec_elt_at_index (tm46->established_connections, iest0); + + est0->sequence_numbers = min0->sequence_numbers; + est0->max_segment_size = (min0->max_segment_size + - STRUCT_SIZE_OF (tcp_ack_packet_t, options)); + est0->his_window_scale = min0->window_scale; + est0->his_window = clib_net_to_host_u16 (tcp0->window); + est0->time_stamps.ours_host_byte_order = min0->time_stamps.ours_host_byte_order; + + /* Compute first measurement of round trip time. */ + { + u32 t = tcp_options_decode_for_ack (tm, tcp0, &est0->time_stamps.his_net_byte_order); + f64 dt = (timestamp_now - t) * tm->secs_per_tick[TCP_TIMER_timestamp]; + est0->round_trip_time_stats.sum = dt; + est0->round_trip_time_stats.sum2 = dt*dt; + est0->round_trip_time_stats.count = 1; + + { + ELOG_TYPE_DECLARE (e) = { + .format = "establish ack rtt: %.4e", + .format_args = "f8", + }; + struct { f64 dt; } * ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->dt = dt; + } + } + + est0->my_window_scale = 7; + est0->my_window = 256; + + l0 = pool_elt_at_index (tm->listener_pool, vnet_buffer (p0)->ip.tcp.listener_index); + vec_add1 (l0->event_connections[is_ip6], tcp_connection_handle_set (iest0, is_ip6)); + + next0 = TCP_ESTABLISH_NEXT_DROP; + error0 = TCP_ERROR_LISTENS_ESTABLISHED; + + enqueue0: + p0->error = error_node->errors[error0]; + if (PREDICT_FALSE (next0 != next)) + { + to_next -= 1; + n_left_to_next += 1; + + vlib_put_next_frame (vm, node, next, n_left_to_next); + + next = next0; + vlib_get_next_frame (vm, node, next, to_next, 
n_left_to_next); + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + } + continue; + + already_established0: + next0 = TCP_ESTABLISH_NEXT_ESTABLISHED; + error0 = TCP_ERROR_NONE; + goto enqueue0; + + unexpected_seq_number0: + next0 = TCP_ESTABLISH_NEXT_DROP; + error0 = TCP_ERROR_UNEXPECTED_SEQ_NUMBER; + goto enqueue0; + + unexpected_ack_number0: + next0 = TCP_ESTABLISH_NEXT_DROP; + error0 = TCP_ERROR_UNEXPECTED_ACK_NUMBER; + goto enqueue0; + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + if (node->flags & VLIB_NODE_FLAG_TRACE) + /* FIXME */ ; + + /* Inform listeners of new connections. */ + { + tcp_listener_t * l; + uword n; + pool_foreach (l, tm->listener_pool, ({ + if ((n = vec_len (l->event_connections[is_ip6])) > 0) + { + if (l->event_function) + l->event_function (l->event_connections[is_ip6], + TCP_EVENT_connection_established); + if (tm->n_established_connections[is_ip6] == 0) + vlib_node_set_state (vm, tm46->output_node_index, VLIB_NODE_STATE_POLLING); + tm->n_established_connections[is_ip6] += n; + _vec_len (l->event_connections[is_ip6]) = 0; + } + })); + } + + return frame->n_vectors; +} + +static uword +ip4_tcp_establish (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_establish (vm, node, frame, /* is_ip6 */ 0); } + +static uword +ip6_tcp_establish (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_establish (vm, node, frame, /* is_ip6 */ 1); } + +VLIB_REGISTER_NODE (ip4_tcp_establish_node,static) = { + .function = ip4_tcp_establish, + .name = "ip4-tcp-establish", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_ESTABLISH_N_NEXT, + .next_nodes = { + [TCP_ESTABLISH_NEXT_DROP] = "error-drop", + [TCP_ESTABLISH_NEXT_ESTABLISHED] = "ip4-tcp-established", + }, +}; + +VLIB_REGISTER_NODE (ip6_tcp_establish_node,static) = { + .function = ip6_tcp_establish, + .name = "ip6-tcp-establish", + + .vector_size = sizeof (u32), + + .n_next_nodes = 
TCP_ESTABLISH_N_NEXT, + .next_nodes = { + [TCP_ESTABLISH_NEXT_DROP] = "error-drop", + [TCP_ESTABLISH_NEXT_ESTABLISHED] = "ip6-tcp-established", + }, +}; + +static_always_inline void +tcp_free_connection_x1 (vlib_main_t * vm, tcp_main_t * tm, + tcp_ip_4_or_6_t is_ip6, + u32 iest0) +{ + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + tcp_connection_t * est0; + u32 iest_div0, iest_mod0; + + iest_div0 = iest0 / 4; + iest_mod0 = iest0 % 4; + + if (is_ip6) + { + ip6_tcp_udp_address_x4_t * esta0; + esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest_div0); + ip6_tcp_udp_address_x4_invalidate (esta0, iest_mod0); + } + else + { + ip4_tcp_udp_address_x4_t * esta0; + esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest_div0); + ip4_tcp_udp_address_x4_invalidate (esta0, iest_mod0); + } + + est0 = vec_elt_at_index (tm46->established_connections, iest0); +} + +static_always_inline void +tcp_free_connection_x2 (vlib_main_t * vm, tcp_main_t * tm, + tcp_ip_4_or_6_t is_ip6, + u32 iest0, u32 iest1) +{ + tcp_free_connection_x1 (vm, tm, is_ip6, iest0); + tcp_free_connection_x1 (vm, tm, is_ip6, iest1); +} + +static_always_inline uword +ip46_tcp_output (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + tcp_ip_4_or_6_t is_ip6) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + u32 * cis, * to_next, n_left_to_next, n_connections_left; + u32 timestamp_now_host_byte_order, timestamp_now_net_byte_order; + vlib_node_runtime_t * error_node; + const u32 next = 0; + uword n_acks; + + /* Inform listeners of new connections. 
*/ + { + tcp_listener_t * l; + pool_foreach (l, tm->listener_pool, ({ + if (vec_len (l->eof_connections) > 0) + { + if (l->event_function) + l->event_function (l->eof_connections[is_ip6], TCP_EVENT_fin_received); + else + { + uword i; + for (i = 0; i < vec_len (l->eof_connections[is_ip6]); i++) + { + tcp_connection_t * c = tcp_get_connection (l->eof_connections[is_ip6][i]); + c->flags |= TCP_CONNECTION_FLAG_application_requested_close; + } + } + _vec_len (l->eof_connections[is_ip6]) = 0; + } + + if (vec_len (l->close_connections[is_ip6]) > 0) + { + uword n_left; + u32 * cis; + + if (l->event_function) + l->event_function (l->close_connections[is_ip6], TCP_EVENT_connection_closed); + + cis = l->close_connections[is_ip6]; + n_left = vec_len (cis); + ASSERT (tm->n_established_connections[is_ip6] >= n_left); + tm->n_established_connections[is_ip6] -= n_left; + if (tm->n_established_connections[is_ip6] == 0) + vlib_node_set_state (vm, tm46->output_node_index, VLIB_NODE_STATE_DISABLED); + while (n_left >= 2) + { + tcp_free_connection_x2 (vm, tm, is_ip6, cis[0], cis[1]); + n_left -= 2; + cis += 2; + } + + while (n_left > 0) + { + tcp_free_connection_x1 (vm, tm, is_ip6, cis[0]); + n_left -= 1; + cis += 1; + } + + _vec_len (l->close_connections[is_ip6]) = 0; + } + })); + } + + n_acks = 0; + cis = tm46->connections_pending_acks; + n_connections_left = vec_len (cis); + if (n_connections_left == 0) + return n_acks; + _vec_len (tm46->connections_pending_acks) = 0; + error_node = vlib_node_get_runtime + (vm, is_ip6 ? 
ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index); + + timestamp_now_host_byte_order = tcp_time_now (tm, TCP_TIMER_timestamp); + timestamp_now_net_byte_order = clib_host_to_net_u32 (timestamp_now_host_byte_order); + + while (n_connections_left > 0) + { + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + + while (n_connections_left > 0 && n_left_to_next > 0) + { + tcp_connection_t * est0; + tcp_ack_packet_t * tcp0; + tcp_udp_ports_t * ports0; + ip_csum_t tcp_sum0; + tcp_packet_template_type_t template_type0; + u32 bi0, iest0, iest_div0, iest_mod0, my_seq_net0, his_seq_net0; + u8 is_fin0; + + iest0 = cis[0]; + cis += 1; + iest_div0 = iest0 / 4; + iest_mod0 = iest0 % 4; + est0 = vec_elt_at_index (tm46->established_connections, iest0); + + /* Send a FIN along with our ACK if application closed connection. */ + { + u8 is_closed0, fin_sent0; + + is_closed0 = (est0->flags & TCP_CONNECTION_FLAG_application_requested_close) != 0; + fin_sent0 = (est0->flags & TCP_CONNECTION_FLAG_fin_sent) != 0; + + is_fin0 = is_closed0 && ! fin_sent0; + template_type0 = + (is_fin0 + ? 
TCP_PACKET_TEMPLATE_FIN_ACK + : TCP_PACKET_TEMPLATE_ACK); + est0->flags |= is_closed0 << LOG2_TCP_CONNECTION_FLAG_fin_sent; + } + + if (is_ip6) + { + ip6_tcp_ack_packet_t * r0; + ip6_tcp_udp_address_x4_t * esta0; + uword tmp0, i; + + esta0 = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest_div0); + r0 = vlib_packet_template_get_packet + (vm, &tm->ip6.packet_templates[template_type0].vlib, &bi0); + tcp0 = &r0->tcp; + + tcp_sum0 = (tm->ip6.packet_templates[template_type0] + .tcp_checksum_net_byte_order); + + for (i = 0; i < ARRAY_LEN (r0->ip6.src_address.as_u32); i++) + { + tmp0 = r0->ip6.src_address.as_u32[i] = esta0->dst.as_u32[i][iest_mod0]; + tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0); + + tmp0 = r0->ip6.dst_address.as_u32[i] = esta0->src.as_u32[i][iest_mod0]; + tcp_sum0 = ip_csum_add_even (tcp_sum0, tmp0); + } + + ports0 = &esta0->ports.as_ports[iest_mod0]; + } + else + { + ip4_tcp_ack_packet_t * r0; + ip4_tcp_udp_address_x4_t * esta0; + ip_csum_t ip_sum0; + u32 src0, dst0; + + esta0 = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest_div0); + r0 = vlib_packet_template_get_packet + (vm, &tm->ip4.packet_templates[template_type0].vlib, &bi0); + tcp0 = &r0->tcp; + + ip_sum0 = (tm->ip4.packet_templates[template_type0] + .ip4_checksum_net_byte_order); + tcp_sum0 = (tm->ip4.packet_templates[template_type0] + .tcp_checksum_net_byte_order); + + src0 = r0->ip4.src_address.as_u32 = esta0->dst.as_ip4_address[iest_mod0].as_u32; + dst0 = r0->ip4.dst_address.as_u32 = esta0->src.as_ip4_address[iest_mod0].as_u32; + + ip_sum0 = ip_csum_add_even (ip_sum0, src0); + tcp_sum0 = ip_csum_add_even (tcp_sum0, src0); + + ip_sum0 = ip_csum_add_even (ip_sum0, dst0); + tcp_sum0 = ip_csum_add_even (tcp_sum0, dst0); + + r0->ip4.checksum = ip_csum_fold (ip_sum0); + + ASSERT (r0->ip4.checksum == ip4_header_checksum (&r0->ip4)); + ports0 = &esta0->ports.as_ports[iest_mod0]; + } + + tcp_sum0 = ip_csum_add_even (tcp_sum0, ports0->as_u32); + 
tcp0->header.ports.src = ports0->dst; + tcp0->header.ports.dst = ports0->src; + + my_seq_net0 = clib_host_to_net_u32 (est0->sequence_numbers.ours); + his_seq_net0 = clib_host_to_net_u32 (est0->sequence_numbers.his); + + /* FIN accounts for 1 sequence number. */ + est0->sequence_numbers.ours += is_fin0; + + tcp0->header.seq_number = my_seq_net0; + tcp_sum0 = ip_csum_add_even (tcp_sum0, my_seq_net0); + + tcp0->header.ack_number = his_seq_net0; + tcp_sum0 = ip_csum_add_even (tcp_sum0, his_seq_net0); + + est0->time_stamps.ours_host_byte_order = timestamp_now_host_byte_order; + tcp0->options.time_stamp.my_time_stamp = timestamp_now_net_byte_order; + tcp_sum0 = ip_csum_add_even (tcp_sum0, timestamp_now_net_byte_order); + + tcp0->options.time_stamp.his_time_stamp = est0->time_stamps.his_net_byte_order; + tcp_sum0 = ip_csum_add_even (tcp_sum0, est0->time_stamps.his_net_byte_order); + + tcp0->header.checksum = ip_csum_fold (tcp_sum0); + + est0->flags &= ~TCP_CONNECTION_FLAG_ack_pending; + + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + n_connections_left -= 1; + n_acks += 1; + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + vlib_error_count (vm, error_node->node_index, TCP_ERROR_ACKS_SENT, n_acks); + + return n_acks; +} + +static uword +ip4_tcp_output (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_output (vm, node, frame, /* is_ip6 */ 0); } + +static uword +ip6_tcp_output (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_output (vm, node, frame, /* is_ip6 */ 1); } + +VLIB_REGISTER_NODE (ip4_tcp_output_node,static) = { + .function = ip4_tcp_output, + .name = "ip4-tcp-output", + .state = VLIB_NODE_STATE_DISABLED, + .type = VLIB_NODE_TYPE_INPUT, + + .vector_size = sizeof (u32), + + .n_next_nodes = 1, + .next_nodes = { + [0] = CLIB_DEBUG > 0 ? 
"ip4-input" : "ip4-lookup", + }, +}; + +VLIB_REGISTER_NODE (ip6_tcp_output_node,static) = { + .function = ip6_tcp_output, + .name = "ip6-tcp-output", + .state = VLIB_NODE_STATE_DISABLED, + .type = VLIB_NODE_TYPE_INPUT, + + .vector_size = sizeof (u32), + + .n_next_nodes = 1, + .next_nodes = { + [0] = CLIB_DEBUG > 0 ? "ip6-input" : "ip6-lookup", + }, +}; + +static_always_inline void +tcp_ack (tcp_main_t * tm, tcp_connection_t * c, u32 n_bytes) +{ + ASSERT (n_bytes == 0); +} + +typedef enum { + TCP_ESTABLISHED_NEXT_DROP, + TCP_ESTABLISHED_N_NEXT, +} tcp_established_next_t; + +static_always_inline uword +ip46_tcp_established (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame, + tcp_ip_4_or_6_t is_ip6) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + uword n_packets = frame->n_vectors; + u32 * from, * to_next; + u32 n_left_from, n_left_to_next, next, timestamp_now; + vlib_node_runtime_t * error_node; + + error_node = vlib_node_get_runtime + (vm, is_ip6 ? 
ip6_tcp_lookup_node.index : ip4_tcp_lookup_node.index); + + from = vlib_frame_vector_args (frame); + n_left_from = n_packets; + next = node->cached_next_index; + timestamp_now = tcp_time_now (tm, TCP_TIMER_timestamp); + + while (n_left_from > 0) + { + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + + while (n_left_from > 0 && n_left_to_next > 0) + { + vlib_buffer_t * p0; + ip6_header_t * ip60; + ip4_header_t * ip40; + tcp_header_t * tcp0; + tcp_connection_t * est0; + tcp_listener_t * l0; + u32 bi0, iest0, n_data_bytes0, his_ack_host0, n_ack0; + u8 error0, next0, n_advance_bytes0, is_fin0, send_ack0; + + bi0 = to_next[0] = from[0]; + + from += 1; + n_left_from -= 1; + to_next += 1; + n_left_to_next -= 1; + + p0 = vlib_get_buffer (vm, bi0); + + if (is_ip6) + { + ip60 = vlib_buffer_get_current (p0); + tcp0 = ip6_next_header (ip60); + ASSERT (ip60->protocol == IP_PROTOCOL_TCP); + n_advance_bytes0 = tcp_header_bytes (tcp0); + n_data_bytes0 = clib_net_to_host_u16 (ip60->payload_length) - n_advance_bytes0; + n_advance_bytes0 += sizeof (ip60[0]); + } + else + { + ip40 = vlib_buffer_get_current (p0); + tcp0 = ip4_next_header (ip40); + n_advance_bytes0 = (ip4_header_bytes (ip40) + + tcp_header_bytes (tcp0)); + n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - n_advance_bytes0; + } + + iest0 = vnet_buffer (p0)->ip.tcp.established_connection_index; + est0 = vec_elt_at_index (tm46->established_connections, iest0); + + error0 = TCP_ERROR_NO_DATA; + next0 = TCP_ESTABLISHED_NEXT_DROP; + + if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->seq_number) + != est0->sequence_numbers.his)) + goto unexpected_seq_number0; + if (PREDICT_FALSE (clib_net_to_host_u32 (tcp0->ack_number) - est0->sequence_numbers.ours + > est0->n_tx_unacked_bytes)) + goto unexpected_ack_number0; + + is_fin0 = (tcp0->flags & TCP_FLAG_FIN) != 0; + + if (PREDICT_FALSE ((est0->flags & TCP_CONNECTION_FLAG_fin_received) + && (is_fin0 || n_data_bytes0 > 0))) + goto already_received_fin0; + + /* Update 
window. */ + est0->his_window = clib_net_to_host_u16 (tcp0->window); + + /* Update his sequence number to account for data he's just sent. */ + est0->sequence_numbers.his += n_data_bytes0 + is_fin0; + + his_ack_host0 = clib_net_to_host_u32 (tcp0->ack_number); + n_ack0 = his_ack_host0 - est0->sequence_numbers.ours; + tcp_ack (tm, est0, n_ack0); + est0->sequence_numbers.ours = his_ack_host0; + + { + u32 t = tcp_options_decode_for_ack (tm, tcp0, &est0->time_stamps.his_net_byte_order); + if (t != est0->time_stamps.ours_host_byte_order) + { + f64 dt = (timestamp_now - t) * tm->secs_per_tick[TCP_TIMER_timestamp]; + est0->round_trip_time_stats.sum += dt; + est0->round_trip_time_stats.sum2 += dt*dt; + est0->round_trip_time_stats.count += 1; + est0->time_stamps.ours_host_byte_order = t; + + { + ELOG_TYPE_DECLARE (e) = { + .format = "ack rtt: %.4e", + .format_args = "f8", + }; + struct { f64 dt; } * ed; + ed = ELOG_DATA (&vm->elog_main, e); + ed->dt = dt; + } + } + } + + send_ack0 = ((est0->flags & TCP_CONNECTION_FLAG_ack_pending) == 0 + && (n_data_bytes0 > 0 || is_fin0)); + vec_add1 (tm46->connections_pending_acks, vnet_buffer (p0)->ip.tcp.established_connection_index); + _vec_len (tm46->connections_pending_acks) -= ! send_ack0; + est0->flags |= send_ack0 << LOG2_TCP_CONNECTION_FLAG_ack_pending; + + est0->flags |= is_fin0 << LOG2_TCP_CONNECTION_FLAG_fin_received; + + l0 = pool_elt_at_index (tm->listener_pool, vnet_buffer (p0)->ip.tcp.listener_index); + + { + u32 ch0 = tcp_connection_handle_set (iest0, is_ip6); + + vec_add1 (l0->eof_connections[is_ip6], ch0); + _vec_len (l0->eof_connections[is_ip6]) -= ! is_fin0; + + vec_add1 (l0->close_connections[is_ip6], ch0); + _vec_len (l0->close_connections[is_ip6]) -= !(est0->flags & TCP_CONNECTION_FLAG_fin_sent); + } + + next0 = n_data_bytes0 > 0 ? 
l0->next_index : next0; + + vlib_buffer_advance (p0, n_advance_bytes0); + + enqueue0: + p0->error = error_node->errors[error0]; + if (PREDICT_FALSE (next0 != next)) + { + to_next -= 1; + n_left_to_next += 1; + + vlib_put_next_frame (vm, node, next, n_left_to_next); + + next = next0; + vlib_get_next_frame (vm, node, next, to_next, n_left_to_next); + to_next[0] = bi0; + to_next += 1; + n_left_to_next -= 1; + } + continue; + + unexpected_seq_number0: + next0 = TCP_ESTABLISHED_NEXT_DROP; + error0 = TCP_ERROR_UNEXPECTED_SEQ_NUMBER; + goto enqueue0; + + unexpected_ack_number0: + next0 = TCP_ESTABLISHED_NEXT_DROP; + error0 = TCP_ERROR_UNEXPECTED_ACK_NUMBER; + goto enqueue0; + + already_received_fin0: + next0 = TCP_ESTABLISHED_NEXT_DROP; + error0 = TCP_ERROR_SEGMENT_AFTER_FIN; + goto enqueue0; + } + + vlib_put_next_frame (vm, node, next, n_left_to_next); + } + + if (node->flags & VLIB_NODE_FLAG_TRACE) + /* FIXME */ ; + + return frame->n_vectors; +} + +static uword +ip4_tcp_established (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_established (vm, node, frame, /* is_ip6 */ 0); } + +static uword +ip6_tcp_established (vlib_main_t * vm, + vlib_node_runtime_t * node, + vlib_frame_t * frame) +{ return ip46_tcp_established (vm, node, frame, /* is_ip6 */ 1); } + +VLIB_REGISTER_NODE (ip4_tcp_established_node,static) = { + .function = ip4_tcp_established, + .name = "ip4-tcp-established", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = { + [TCP_ESTABLISHED_NEXT_DROP] = "error-drop", + }, +}; + +VLIB_REGISTER_NODE (ip6_tcp_established_node,static) = { + .function = ip6_tcp_established, + .name = "ip6-tcp-established", + + .vector_size = sizeof (u32), + + .n_next_nodes = TCP_ESTABLISHED_N_NEXT, + .next_nodes = { + [TCP_ESTABLISHED_NEXT_DROP] = "error-drop", + }, +}; + +uword +tcp_register_listener (vlib_main_t * vm, + tcp_listener_registration_t * r) +{ + tcp_main_t * tm = &tcp_main; + 
tcp_listener_t * l; + + { + clib_error_t * error; + + if ((error = vlib_call_init_function (vm, tcp_udp_lookup_init))) + clib_error_report (error); + } + + pool_get_aligned (tm->listener_pool, l, CLIB_CACHE_LINE_BYTES); + + memset (l, 0, sizeof (l[0])); + + l->dst_port = r->port; + l->next_index = vlib_node_add_next (vm, ip4_tcp_established_node.index, r->data_node_index); + l->valid_local_adjacency_bitmap = 0; + l->flags = r->flags & (TCP_LISTENER_IP4 | TCP_LISTENER_IP6); + + tm->listener_index_by_dst_port[clib_host_to_net_u16 (l->dst_port)] = l - tm->listener_pool; + + return l - tm->listener_pool; +} + +static void +tcp_udp_lookup_ip4_add_del_interface_address (ip4_main_t * im, + uword opaque, + u32 sw_if_index, + ip4_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_delete) +{ + tcp_main_t * tm = &tcp_main; + + tm->ip4.default_valid_local_adjacency_bitmap + = clib_bitmap_set (tm->ip4.default_valid_local_adjacency_bitmap, + if_address_index, + is_delete ? 0 : 1); +} + +static void +tcp_udp_lookup_ip6_add_del_interface_address (ip6_main_t * im, + uword opaque, + u32 sw_if_index, + ip6_address_t * address, + u32 address_length, + u32 if_address_index, + u32 is_delete) +{ + tcp_main_t * tm = &tcp_main; + + tm->ip6.default_valid_local_adjacency_bitmap + = clib_bitmap_set (tm->ip6.default_valid_local_adjacency_bitmap, + if_address_index, + is_delete ? 
0 : 1); +} + +static clib_error_t * +tcp_udp_lookup_init (vlib_main_t * vm) +{ + tcp_main_t * tm = &tcp_main; + ip4_main_t * im4 = &ip4_main; + ip6_main_t * im6 = &ip6_main; + clib_error_t * error; + + if ((error = vlib_call_init_function (vm, ip4_lookup_init))) + return error; + if ((error = vlib_call_init_function (vm, ip6_lookup_init))) + return error; + + tcp_time_init (vm, tm); + + { + ip4_add_del_interface_address_callback_t cb; + + cb.function = tcp_udp_lookup_ip4_add_del_interface_address; + cb.function_opaque = 0; + vec_add1 (im4->add_del_interface_address_callbacks, cb); + } + + { + ip6_add_del_interface_address_callback_t cb; + + cb.function = tcp_udp_lookup_ip6_add_del_interface_address; + cb.function_opaque = 0; + vec_add1 (im6->add_del_interface_address_callbacks, cb); + } + + tm->ip4.output_node_index = ip4_tcp_output_node.index; + tm->ip6.output_node_index = ip6_tcp_output_node.index; + + tcp_lookup_init (vm, tm); + tcp_options_decode_init (tm); + + tm->tx_buffer_free_list = VLIB_BUFFER_DEFAULT_FREE_LIST_INDEX; + tm->tx_buffer_free_list_n_buffer_bytes = VLIB_BUFFER_DEFAULT_FREE_LIST_BYTES; + + return 0; +} + +VLIB_INIT_FUNCTION (tcp_udp_lookup_init); + +static u8 * format_tcp_time_stamp (u8 * s, va_list * va) +{ + tcp_timer_type_t type = va_arg (*va, tcp_timer_type_t); + u32 value = va_arg (*va, u32); + vlib_main_t * vm = vlib_get_main(); + tcp_main_t * tm = &tcp_main; + u64 now; + f64 dt; + + now = clib_cpu_time_now (); + dt = vm->clib_time.seconds_per_clock * (now - (value << tm->log2_clocks_per_tick[type])); + return format (s, "%.4e sec", dt); +} + +static u8 * format_tcp_connection_state (u8 * s, va_list * va) +{ + tcp_connection_state_t st = va_arg (*va, tcp_connection_state_t); + char * t = 0; + switch (st) + { +#define _(f) case TCP_CONNECTION_STATE_##f: t = #f; break; + foreach_tcp_connection_state +#undef _ + default: break; + } + if (t) + s = format (s, "%s", t); + else + s = format (s, "unknown 0x%x", st); + + return s; +} + +static u8 * 
format_tcp_ip_4_or_6 (u8 * s, va_list * va) +{ + tcp_ip_4_or_6_t is_ip6 = va_arg (*va, tcp_ip_4_or_6_t); + return format (s, "%s", is_ip6 ? "ip6" : "ip4"); +} + +static u8 * format_tcp_mini_connection (u8 * s, va_list * va) +{ + tcp_mini_connection_t * c = va_arg (*va, tcp_mini_connection_t *); + + s = format (s, "state %U, window scale %d, mss %d", + format_tcp_connection_state, c->state, + c->window_scale, c->max_segment_size); + + return s; +} + +/* Format ip4 mini connection number imin (u32 vararg): address and age first, then connection state. */ +static u8 * format_ip4_tcp_mini_connection (u8 * s, va_list * va) +{ + u32 imin = va_arg (*va, u32); + u32 imin_div, imin_mod; + tcp_main_t * tm = &tcp_main; + tcp_mini_connection_t * min; + ip4_tcp_udp_address_x4_and_timestamps_t * mina; + + /* Addresses and time stamps are kept four to an element: imin_div picks the element, imin_mod the lane (0..3) inside it. */ + imin_div = imin / 4; + imin_mod = imin % 4; + + mina = vec_elt_at_index (tm->ip4_mini_connection_address_hash, imin_div); + + s = format (s, "%U, age %U", + format_ip4_tcp_udp_address_x4, &mina->address_x4, imin_mod, + format_tcp_time_stamp, TCP_TIMER_mini_connection, mina->time_stamps[imin_mod]); + + min = vec_elt_at_index (tm->ip4.mini_connections, imin); + + s = format (s, "%U", format_tcp_mini_connection, min); + + return s; +} + +/* Format ip6 mini connection number imin (u32 vararg): address and age first, then connection state. */ +static u8 * format_ip6_tcp_mini_connection (u8 * s, va_list * va) +{ + u32 imin = va_arg (*va, u32); + u32 imin_div, imin_mod; + tcp_main_t * tm = &tcp_main; + tcp_mini_connection_t * min; + ip6_tcp_udp_address_x4_and_timestamps_t * mina; + + /* Same layout as ip4: imin_div picks the x4 element, imin_mod the lane (0..3) inside it. */ + imin_div = imin / 4; + imin_mod = imin % 4; + + mina = vec_elt_at_index (tm->ip6_mini_connection_address_hash, imin_div); + + s = format (s, "%U, age %U", + format_ip6_tcp_udp_address_x4, &mina->address_x4, imin_mod, + format_tcp_time_stamp, TCP_TIMER_mini_connection, mina->time_stamps[imin_mod]); + + min = vec_elt_at_index (tm->ip6.mini_connections, imin); + + s = format (s, "%U", format_tcp_mini_connection, min); + + return s; +} + +static u8 * format_tcp_established_connection (u8 * s, va_list * va) +{ + tcp_connection_t * c = va_arg (*va, tcp_connection_t *); + + if (c->flags != 0) + { + s = format (s, ", 
flags: "); +#define _(f) if (c->flags & TCP_CONNECTION_FLAG_##f) s = format (s, "%s, ", #f); + foreach_tcp_connection_flag; +#undef _ + } + + if (tcp_round_trip_time_stats_is_valid (&c->round_trip_time_stats)) + { + f64 r[2]; + tcp_round_trip_time_stats_compute (&c->round_trip_time_stats, r); + s = format (s, ", rtt %.4e +- %.4e", + r[0], r[1]); + } + + return s; +} + +static u8 * format_ip4_tcp_established_connection (u8 * s, va_list * va) +{ + u32 iest = va_arg (*va, u32); + u32 iest_div, iest_mod; + tcp_main_t * tm = &tcp_main; + tcp_connection_t * est; + ip4_tcp_udp_address_x4_t * esta; + + iest_div = iest / 4; + iest_mod = iest % 4; + + esta = vec_elt_at_index (tm->ip4_established_connection_address_hash, iest_div); + est = vec_elt_at_index (tm->ip4.established_connections, iest); + + s = format (s, "%U%U", + format_ip4_tcp_udp_address_x4, esta, iest_mod, + format_tcp_established_connection, est); + + return s; +} + +static u8 * format_ip6_tcp_established_connection (u8 * s, va_list * va) +{ + u32 iest = va_arg (*va, u32); + u32 iest_div, iest_mod; + tcp_main_t * tm = &tcp_main; + tcp_connection_t * est; + ip6_tcp_udp_address_x4_t * esta; + + iest_div = iest / 4; + iest_mod = iest % 4; + + esta = vec_elt_at_index (tm->ip6_established_connection_address_hash, iest_div); + est = vec_elt_at_index (tm->ip6.established_connections, iest); + + s = format (s, "%U%U", + format_ip6_tcp_udp_address_x4, esta, iest_mod, + format_tcp_established_connection, est); + + return s; +} + +VLIB_CLI_COMMAND (vlib_cli_show_tcp_command, static) = { + .path = "show tcp", + .short_help = "Transmission control protocol (TCP) show commands", +}; + +static clib_error_t * +show_mini_connections (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46; + tcp_ip_4_or_6_t is_ip6 = TCP_IP4; + tcp_mini_connection_t * min; + ip6_tcp_udp_address_x4_and_timestamps_t * mina6; + ip4_tcp_udp_address_x4_and_timestamps_t * mina4; 
+ clib_error_t * error = 0; + uword i, i0, i1, n_valid; + + if (unformat (input, "4")) + is_ip6 = TCP_IP4; + if (unformat (input, "6")) + is_ip6 = TCP_IP6; + + n_valid = 0; + tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + for (i = 0; i <= tm46->mini_connection_hash_mask; i++) + { + i0 = i / 4; + i1 = i % 4; + + min = vec_elt_at_index (tm46->mini_connections, i); + if (is_ip6) + { + mina6 = vec_elt_at_index (tm->ip6_mini_connection_address_hash, i0); + if (ip6_tcp_udp_address_x4_is_valid (&mina6->address_x4, i1)) + { + vlib_cli_output (vm, "%U", format_ip4_tcp_mini_connection, i); + n_valid += 1; + } + } + else + { + mina4 = vec_elt_at_index (tm->ip4_mini_connection_address_hash, i0); + if (ip4_tcp_udp_address_x4_is_valid (&mina4->address_x4, i1)) + { + vlib_cli_output (vm, "%U", format_ip6_tcp_mini_connection, i); + n_valid += 1; + } + } + } + + if (n_valid == 0) + vlib_cli_output (vm, "no %U mini tcp connections", format_tcp_ip_4_or_6, is_ip6); + + return error; +} + +VLIB_CLI_COMMAND (vlib_cli_show_tcp_mini_connections_command) = { + .path = "show tcp mini-connections", + .short_help = "Show not-yet established TCP connections", + .function = show_mini_connections, +}; + +static clib_error_t * +show_established_connections (vlib_main_t * vm, unformat_input_t * input, vlib_cli_command_t * cmd) +{ + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46; + tcp_ip_4_or_6_t is_ip6 = TCP_IP4; + tcp_connection_t * est; + ip6_tcp_udp_address_x4_t * esta6; + ip4_tcp_udp_address_x4_t * esta4; + clib_error_t * error = 0; + uword i, i0, i1, n_valid; + + if (unformat (input, "4")) + is_ip6 = TCP_IP4; + if (unformat (input, "6")) + is_ip6 = TCP_IP6; + + n_valid = 0; + tm46 = is_ip6 ? 
&tm->ip6 : &tm->ip4; + for (i = 0; i < vec_len (tm46->established_connections); i++) + { + i0 = i / 4; + i1 = i % 4; + + est = vec_elt_at_index (tm46->established_connections, i); + if (is_ip6) + { + esta6 = vec_elt_at_index (tm->ip6_established_connection_address_hash, i0); + if (ip6_tcp_udp_address_x4_is_valid (esta6, i1)) + { + vlib_cli_output (vm, "%U", format_ip6_tcp_established_connection, i); + n_valid += 1; + } + } + else + { + esta4 = vec_elt_at_index (tm->ip4_established_connection_address_hash, i0); + if (ip4_tcp_udp_address_x4_is_valid (esta4, i1)) + { + vlib_cli_output (vm, "%U", format_ip4_tcp_established_connection, i); + n_valid += 1; + } + } + } + + if (n_valid == 0) + vlib_cli_output (vm, "no %U established tcp connections", format_tcp_ip_4_or_6, is_ip6); + + return error; +} + +VLIB_CLI_COMMAND (vlib_cli_show_tcp_established_connections_command, static) = { + .path = "show tcp connections", + .short_help = "Show established TCP connections", + .function = show_established_connections, +}; + +#if 0 +uword +tcp_write (vlib_main_t * vm, u32 connection_handle, void * data, uword n_data_bytes) +{ + tcp_main_t * tm = &tcp_main; + tcp_ip_4_or_6_t is_ip6 = tcp_connection_is_ip6 (connection_handle); + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + tcp_connection_t * c = vec_elt_at_index (tm46->established_connections, connection_handle / 2); + vlib_buffer_t * b; + u32 bi, bi_next, bi_start_of_packet; + ip_csum_t sum; + + b = 0; + bi = c->write_tail_buffer_index; + n_bytes_left_tail = 0; + if (bi != 0) + { + b = vlib_get_buffer (vm, bi); + n_bytes_left_tail = tm->tx_buffer_free_list_n_buffer_bytes - b->current_length; + } + + n_bytes_this_packet = c->write_tail_packet.n_data_bytes; + n_bytes_left_packet = c->max_segment_size - n_bytes_this_packet; + + n_data_left = n_data_bytes; + sum = c->write_tail_packet.data_ip_checksum; + + while (n_data_left > 0) + { + u32 n_copy; + + if (n_bytes_left_tail == 0) + { + if (! 
vlib_buffer_alloc_from_free_list (vm, &bi_next, 1, + tm->tx_buffer_free_list)) + return n_data_bytes - n_data_left; + + bi_start_of_packet = bi_next; + if (b) + { + b->flags |= VLIB_BUFFER_NEXT_PRESENT; + b->next_buffer = bi_next; + bi_start_of_packet = b->opaque[0]; + } + bi = bi_next; + b = vlib_get_buffer (vm, bi); + + /* Save away start of packet buffer in opaque. */ + b->opaque[0] = bi_start_of_packet; + + c->tail_buffer.buffer_index = bi; + n_bytes_left_tail = tm->tx_buffer_free_list_n_buffer_bytes; + } + + n_copy = n_data_left; + n_copy = clib_min (n_copy, n_bytes_left_tail); + n_copy = clib_min (n_copy, n_bytes_left_packet); + + sum = ip_csum_and_memcpy (sum, b->data + b->current_length, + data, n_copy); + + b->current_length += n_copy; + n_bytes_left_tail -= n_copy; + n_bytes_left_packet -= n_copy; + n_data_left -=- n_copy; + n_bytes_this_packet += n_copy; + + if (n_bytes_left_packet == 0) + { + bi_start_of_packet = b->opaque[0]; + + if (c->tail_packet.buffer_index != 0) + { + vlib_buffer_t * p = vlib_get_buffer (vm, c->tail_packet.buffer_index); + tcp_buffer_t * next = vlib_get_buffer_opaque (p); + next[0] = c->; + } + c->tail_packet.buffer_index = bi_start_of_packet; + } + } + + c->tail_buffer.buffer_index = bi; + c->tail_buffer.n_data_bytes = n_bytes_this_packet; + c->tail_buffer.data_ip_checksum = ip_csum_fold (sum); + + return 0; +} +#endif diff --git a/vnet/vnet/ip/tcp.h b/vnet/vnet/ip/tcp.h new file mode 100644 index 00000000000..98d8e34f0d5 --- /dev/null +++ b/vnet/vnet/ip/tcp.h @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/tcp.h: tcp protocol + * + * Copyright (c) 2011 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_tcp_protocol_h +#define included_tcp_protocol_h + +#include <vppinfra/vector.h> + +/* No support for e.g. Altivec. 
*/ +#if defined (__SSE2__) +#define TCP_HAVE_VEC128 +#endif + +typedef union { + struct { + u16 src, dst; + }; + u32 as_u32; +} tcp_udp_ports_t; + +typedef union { +#ifdef TCP_HAVE_VEC128 + u32x4 as_u32x4; +#endif + tcp_udp_ports_t as_ports[4]; +} tcp_udp_ports_x4_t; + +typedef struct { + union { +#ifdef TCP_HAVE_VEC128 + u32x4 as_u32x4; +#endif + ip4_address_t as_ip4_address[4]; + } src, dst; + tcp_udp_ports_x4_t ports; +} ip4_tcp_udp_address_x4_t; + +typedef struct { + union { +#ifdef TCP_HAVE_VEC128 + u32x4 as_u32x4[4]; +#endif + u32 as_u32[4][4]; + } src, dst; + tcp_udp_ports_x4_t ports; +} ip6_tcp_udp_address_x4_t; + +typedef struct { + u32 his, ours; +} tcp_sequence_pair_t; + +/* Time stamps saved from options. */ +typedef struct { + u32 ours_host_byte_order, his_net_byte_order; +} tcp_time_stamp_pair_t; + +typedef struct { + ip4_tcp_udp_address_x4_t address_x4; + u32 time_stamps[4]; +} ip4_tcp_udp_address_x4_and_timestamps_t; + +typedef struct { + ip6_tcp_udp_address_x4_t address_x4; + u32 time_stamps[4]; +} ip6_tcp_udp_address_x4_and_timestamps_t; + +#define foreach_tcp_connection_state \ + /* unused */ \ + _ (unused) \ + /* Sent SYN-ACK waiting for ACK if he ever feels like sending one. */ \ + _ (listen_ack_wait) \ + /* Sent SYN waiting for ACK or RST. */ \ + _ (connecting) \ + /* Pseudo-type for established connections. */ \ + _ (established) + +typedef enum { +#define _(f) TCP_CONNECTION_STATE_##f, + foreach_tcp_connection_state +#undef _ + TCP_N_CONNECTION_STATE, +} tcp_connection_state_t; + +/* Kept small to fight off syn flood attacks. */ +typedef struct { + tcp_sequence_pair_t sequence_numbers; + + tcp_time_stamp_pair_t time_stamps; + + /* segment size and window scale (saved from options + or set to defaults). */ + u16 max_segment_size; + + u8 window_scale; + + tcp_connection_state_t state : 8; +} tcp_mini_connection_t; + +typedef struct { + /* Sum and sum^2 of measurements. + Used to compute average and RMS. 
*/ + f64 sum, sum2; + + /* Number of measurements. */ + f64 count; +} tcp_round_trip_time_stats_t; + +typedef struct { + u32 first_buffer_index_this_packet; + + u16 data_ip_checksum; + + u16 n_data_bytes; +} tcp_tx_packet_t; + +typedef struct { + tcp_sequence_pair_t sequence_numbers; + + tcp_time_stamp_pair_t time_stamps; + + tcp_tx_packet_t head_packet, tx_tail_packet, write_tail_packet; + + u32 write_tail_buffer_index; + + tcp_round_trip_time_stats_t round_trip_time_stats; + + /* Number of un-acknowledged bytes we've sent. */ + u32 n_tx_unacked_bytes; + + /* segment size and window scale (saved from options + or set to defaults). */ + u16 max_segment_size; + + /* Window from latest received packet. */ + u16 his_window; + + u16 my_window; + + u8 his_window_scale; + + u8 my_window_scale; + + /* ip4/ip6 tos/ttl to use for packets we send. */ + u8 tos, ttl; + + u16 flags; +#define foreach_tcp_connection_flag \ + _ (ack_pending) \ + _ (fin_received) \ + _ (fin_sent) \ + _ (application_requested_close) + + u8 listener_opaque[128 + - 1 * sizeof (tcp_sequence_pair_t) + - 1 * sizeof (tcp_time_stamp_pair_t) + - 3 * sizeof (tcp_tx_packet_t) + - 1 * sizeof (tcp_round_trip_time_stats_t) + - 2 * sizeof (u32) + - 4 * sizeof (u16) + - 4 * sizeof (u8)]; +} tcp_connection_t; + +typedef enum { + TCP_IP4, + TCP_IP6, + TCP_N_IP46, +} tcp_ip_4_or_6_t; + +typedef enum { +#define _(f) LOG2_TCP_CONNECTION_FLAG_##f, + foreach_tcp_connection_flag +#undef _ + N_TCP_CONNECTION_FLAG, +#define _(f) TCP_CONNECTION_FLAG_##f = 1 << LOG2_TCP_CONNECTION_FLAG_##f, + foreach_tcp_connection_flag +#undef _ +} tcp_connection_flag_t; + +typedef enum { + TCP_PACKET_TEMPLATE_SYN, + TCP_PACKET_TEMPLATE_SYN_ACK, + TCP_PACKET_TEMPLATE_ACK, + TCP_PACKET_TEMPLATE_FIN_ACK, + TCP_PACKET_TEMPLATE_RST_ACK, + TCP_N_PACKET_TEMPLATE, +} tcp_packet_template_type_t; + +typedef struct { + vlib_packet_template_t vlib; + + /* TCP checksum of template with zeros for all + variable fields. Network byte order. 
*/ + u16 tcp_checksum_net_byte_order; + + /* IP4 checksum. */ + u16 ip4_checksum_net_byte_order; +} tcp_packet_template_t; + +typedef struct { + u8 log2_n_mini_connection_hash_elts; + u8 log2_n_established_connection_hash_elts; + u8 is_ip6; + + u32 mini_connection_hash_mask; + u32 established_connection_hash_mask; + + uword * established_connection_overflow_hash; + + tcp_mini_connection_t * mini_connections; + + tcp_connection_t * established_connections; + + /* Vector of established connection indices which need ACKs sent. */ + u32 * connections_pending_acks; + + /* Default valid_local_adjacency_bitmap for listeners who want to listen + for a given port in on all interfaces. */ + uword * default_valid_local_adjacency_bitmap; + + u32 output_node_index; + + tcp_packet_template_t packet_templates[TCP_N_PACKET_TEMPLATE]; +} ip46_tcp_main_t; + +#define foreach_tcp_event \ + /* Received a SYN-ACK after sending a SYN to connect. */ \ + _ (connection_established) \ + /* Received a reset (RST) after sending a SYN to connect. */ \ + _ (connect_failed) \ + /* Received a FIN from an established connection. */ \ + _ (fin_received) \ + _ (connection_closed) \ + /* Received a reset RST from an established connection. */ \ + _ (reset_received) + +typedef enum { +#define _(f) TCP_EVENT_##f, + foreach_tcp_event +#undef _ +} tcp_event_type_t; + +typedef void (tcp_event_function_t) + (u32 * connections, + tcp_event_type_t event_type); + +typedef struct { + /* Bitmap indicating which of local (interface) addresses + we should listen on for this destination port. */ + uword * valid_local_adjacency_bitmap; + + /* Destination tcp/udp port to listen for connections. */ + u16 dst_port; + + u16 next_index; + + u32 flags; + + /* Connection indices for which event in event_function applies to. 
*/ + u32 * event_connections[TCP_N_IP46]; + u32 * eof_connections[TCP_N_IP46]; + u32 * close_connections[TCP_N_IP46]; + + tcp_event_function_t * event_function; +} tcp_listener_t; + +typedef struct { + u8 next, error; +} tcp_lookup_disposition_t; + +#define foreach_tcp_timer \ + /* Used to rank mini connections. */ \ + _ (mini_connection, 10e-3) \ + /* Used for timestamps. */ \ + _ (timestamp, 1e-6) + +typedef enum { +#define _(f,s) TCP_TIMER_##f, + foreach_tcp_timer +#undef _ + TCP_N_TIMER, +} tcp_timer_type_t; + +typedef struct { + ip46_tcp_main_t ip4, ip6; + + /* Array of non-established connections, but soon-to be established connections. */ + ip4_tcp_udp_address_x4_and_timestamps_t * ip4_mini_connection_address_hash; + ip6_tcp_udp_address_x4_and_timestamps_t * ip6_mini_connection_address_hash; + + /* Vector of size log2_n_established_connection_hash_elts plus overflow. */ + ip4_tcp_udp_address_x4_t * ip4_established_connection_address_hash; + ip6_tcp_udp_address_x4_t * ip6_established_connection_address_hash; + + /* Jenkins hash seeds for established and mini hash tables. */ + u32x4_union_t connection_hash_seeds[2][3]; + u32x4_union_t connection_hash_masks[2]; + + /* Pool of listeners. */ + tcp_listener_t * listener_pool; + + /* Table mapping destination port to listener index. */ + u16 * listener_index_by_dst_port; + + tcp_lookup_disposition_t disposition_by_state_and_flags[TCP_N_CONNECTION_STATE][64]; + + u8 log2_clocks_per_tick[TCP_N_TIMER]; + + f64 secs_per_tick[TCP_N_TIMER]; + + /* Holds pointers to default and per-packet TCP options while + parsing a TCP packet's options. */ + tcp_mini_connection_t option_decode_mini_connection_template; + + /* Count of currently established connections. */ + u32 n_established_connections[TCP_N_IP46]; + + u32 tx_buffer_free_list; + u32 tx_buffer_free_list_n_buffer_bytes; +} tcp_main_t; + +/* Global TCP main structure. */ +tcp_main_t tcp_main; + +typedef struct { + /* Listen on this port. 
*/ + u16 port; + +#define TCP_LISTENER_IP4 (1 << 0) +#define TCP_LISTENER_IP6 (1 << 1) + u16 flags; + + /* Next node index for data packets. */ + u32 data_node_index; + + /* Event function: called on new connections, etc. */ + tcp_event_function_t * event_function; +} tcp_listener_registration_t; + +uword +tcp_register_listener (vlib_main_t * vm, tcp_listener_registration_t * r); + +always_inline tcp_ip_4_or_6_t +tcp_connection_is_ip6 (u32 h) +{ return h & 1; } + +always_inline tcp_ip_4_or_6_t +tcp_connection_handle_set (u32 iest, tcp_ip_4_or_6_t is_ip6) +{ return is_ip6 + 2*iest; } + +always_inline tcp_connection_t * +tcp_get_connection (u32 connection_handle) +{ + u32 iest = connection_handle / 2; + tcp_ip_4_or_6_t is_ip6 = tcp_connection_is_ip6 (connection_handle); + tcp_main_t * tm = &tcp_main; + ip46_tcp_main_t * tm46 = is_ip6 ? &tm->ip6 : &tm->ip4; + return vec_elt_at_index (tm46->established_connections, iest); +} + +#endif /* included_tcp_protocol_h */ diff --git a/vnet/vnet/ip/tcp_format.c b/vnet/vnet/ip/tcp_format.c new file mode 100644 index 00000000000..afc3dd20c49 --- /dev/null +++ b/vnet/vnet/ip/tcp_format.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * ip/tcp_format.c: tcp formatting + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> + +static u8 * format_tcp_flags (u8 * s, va_list * args) +{ + int flags = va_arg (*args, int); + +#define _(f) if (flags & TCP_FLAG_##f) s = format (s, "%s, ", #f); + foreach_tcp_flag +#undef _ + + return s; +} + +/* Format TCP header. */ +u8 * format_tcp_header (u8 * s, va_list * args) +{ + tcp_header_t * tcp = va_arg (*args, tcp_header_t *); + u32 max_header_bytes = va_arg (*args, u32); + u32 header_bytes; + uword indent; + + /* Nothing to do. */ + if (max_header_bytes < sizeof (tcp[0])) + return format (s, "TCP header truncated"); + + indent = format_get_indent (s); + indent += 2; + + s = format (s, "TCP: %d -> %d", + clib_net_to_host_u16 (tcp->ports.src), + clib_net_to_host_u16 (tcp->ports.dst)); + + s = format (s, "\n%Useq. 
tx 0x%08x rx 0x%08x", + format_white_space, indent, + clib_net_to_host_u32 (tcp->seq_number), + clib_net_to_host_u32 (tcp->ack_number)); + + /* tcp_header_bytes returns int, which is what %d consumes; the open-coded product with sizeof was a size_t. */ + s = format (s, "\n%Uflags %U, tcp header: %d bytes", + format_white_space, indent, + format_tcp_flags, tcp->flags, + tcp_header_bytes (tcp)); + + s = format (s, "\n%Uwindow %d, checksum 0x%04x", + format_white_space, indent, + clib_net_to_host_u16 (tcp->window), + clib_net_to_host_u16 (tcp->checksum)); + + header_bytes = tcp_header_bytes (tcp); + + /* Format TCP options. */ +#if 0 + { + u8 * o; + u8 * option_start = (void *) (tcp + 1); + u8 * option_end = (void *) tcp + header_bytes; + + for (o = option_start; o < option_end; ) + { + u32 length = o[1]; + switch (o[0]) + { + case TCP_OPTION_END: + length = 1; + o = option_end; + break; + + case TCP_OPTION_NOP: + length = 1; + break; + + } + } + } +#endif + + /* Recurse into next protocol layer. */ + if (max_header_bytes != 0 && header_bytes < max_header_bytes) + { + ip_main_t * im = &ip_main; + tcp_udp_port_info_t * pi; + + pi = ip_get_tcp_udp_port_info (im, tcp->ports.dst); + + if (pi && pi->format_header) + s = format (s, "\n%U%U", + format_white_space, indent - 2, + pi->format_header, + /* next protocol header */ (void*) tcp + header_bytes, + max_header_bytes - header_bytes); + } + + return s; +} diff --git a/vnet/vnet/ip/tcp_init.c b/vnet/vnet/ip/tcp_init.c new file mode 100644 index 00000000000..3e88d87e11e --- /dev/null +++ b/vnet/vnet/ip/tcp_init.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/tcp_init.c: tcp initialization
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/ip/format.h>
+#include <vnet/ip/ip.h>
+#include <vnet/ip/tcp_packet.h>
+
+/*
+ * Hook TCP into the generic IP protocol table.
+ *
+ * Calls the ip_main_init init function first, then installs the TCP
+ * header formatter and the packet-generator header parser on the
+ * IP_PROTOCOL_TCP protocol-info entry.
+ *
+ * Returns the error produced by ip_main_init, an error when no
+ * protocol-info entry exists for TCP, or 0 on success.
+ */
+static clib_error_t *
+tcp_init (vlib_main_t * vm)
+{
+  ip_main_t * im = &ip_main;
+  ip_protocol_info_t * pi;
+  clib_error_t * error;
+
+  /* Hand a failed dependency back to the caller instead of reporting
+     success. */
+  error = vlib_call_init_function (vm, ip_main_init);
+  if (error)
+    return error;
+
+  /* Same guard udp_init applies to its own protocol-info lookup. */
+  pi = ip_get_protocol_info (im, IP_PROTOCOL_TCP);
+  if (pi == 0)
+    return clib_error_return (0, "TCP protocol info AWOL");
+
+  pi->format_header = format_tcp_header;
+  pi->unformat_pg_edit = unformat_pg_tcp_header;
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (tcp_init);
diff --git a/vnet/vnet/ip/tcp_packet.h b/vnet/vnet/ip/tcp_packet.h
new file mode 100644
index 00000000000..ebb111572a0
--- /dev/null
+++ b/vnet/vnet/ip/tcp_packet.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip4/tcp_packet.h: TCP packet format (see RFC 793)
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_tcp_packet_h +#define included_tcp_packet_h + +/* TCP flags bit 0 first. */ +#define foreach_tcp_flag \ + _ (FIN) \ + _ (SYN) \ + _ (RST) \ + _ (PSH) \ + _ (ACK) \ + _ (URG) \ + _ (ECE) \ + _ (CWR) + +enum { +#define _(f) TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ + TCP_N_FLAG_BITS, + +#define _(f) TCP_FLAG_##f = 1 << TCP_FLAG_BIT_##f, + foreach_tcp_flag +#undef _ +}; + +typedef struct { + /* Source and destination port. */ + union { + struct { + u16 src, dst; + }; + u32 src_and_dst; + } ports; + + /* Sequence and acknowledgment number. */ + u32 seq_number, ack_number; + + /* Size of TCP header in 32-bit units plus 4 reserved bits. */ + u8 tcp_header_u32s_and_reserved; + + /* see foreach_tcp_flag for enumation of tcp flags. */ + u8 flags; + + /* Current window advertised by sender. + This is the number of bytes sender is willing to receive + right now. */ + u16 window; + + /* Checksum of TCP pseudo header and data. */ + u16 checksum; + + u16 urgent_pointer; +} tcp_header_t; + +always_inline int +tcp_header_bytes (tcp_header_t * t) +{ return (t->tcp_header_u32s_and_reserved >> 4) * sizeof (u32); } + +/* TCP options. */ +typedef enum tcp_option_type { + TCP_OPTION_END = 0, + TCP_OPTION_NOP = 1, + TCP_OPTION_MSS = 2, + TCP_OPTION_WINDOW_SCALE = 3, + TCP_OPTION_SACK_PERMITTED = 4, + TCP_OPTION_SACK_BLOCK = 5, + TCP_OPTION_TIME_STAMP = 8, +} tcp_option_type_t; + +/* All except NOP and END have 1 byte length field. */ +typedef struct { + tcp_option_type_t type : 8; + + /* Length of this option in bytes. 
*/ + u8 length; +} tcp_option_with_length_t; + +#endif /* included_tcp_packet_h */ + diff --git a/vnet/vnet/ip/tcp_pg.c b/vnet/vnet/ip/tcp_pg.c new file mode 100644 index 00000000000..122592d1594 --- /dev/null +++ b/vnet/vnet/ip/tcp_pg.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/tcp_pg: TCP packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <vnet/ip/ip.h> +#include <vnet/pg/pg.h> + +static void +tcp_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets) +{ + vlib_main_t * vm = pg->vlib_main; + u32 ip_offset, tcp_offset; + + tcp_offset = g->start_byte_offset; + ip_offset = (g-1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + tcp_header_t * tcp0; + ip_csum_t sum0; + u32 tcp_len0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ASSERT (p0->current_data == 0); + ip0 = (void *) (p0->data + ip_offset); + tcp0 = (void *) (p0->data + tcp_offset); + tcp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); + + /* Initialize checksum with header. */ + if (BITS (sum0) == 32) + { + sum0 = clib_mem_unaligned (&ip0->src_address, u32); + sum0 = ip_csum_with_carry (sum0, clib_mem_unaligned (&ip0->dst_address, u32)); + } + else + sum0 = clib_mem_unaligned (&ip0->src_address, u64); + + sum0 = ip_csum_with_carry + (sum0, clib_host_to_net_u32 (tcp_len0 + (ip0->protocol << 16))); + + /* Invalidate possibly old checksum. */ + tcp0->checksum = 0; + + sum0 = ip_incremental_checksum_buffer (vm, p0, tcp_offset, tcp_len0, sum0); + + tcp0->checksum = ~ ip_csum_fold (sum0); + } +} + +typedef struct { + struct { pg_edit_t src, dst; } ports; + pg_edit_t seq_number, ack_number; + pg_edit_t tcp_header_u32s; +#define _(f) pg_edit_t f##_flag; + foreach_tcp_flag +#undef _ + pg_edit_t window; + pg_edit_t checksum; + pg_edit_t urgent_pointer; +} pg_tcp_header_t; + +static inline void +pg_tcp_header_init (pg_tcp_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. 
*/ +#define _(f) pg_edit_init (&p->f, tcp_header_t, f); + _ (ports.src); + _ (ports.dst); + _ (seq_number); + _ (ack_number); + _ (window); + _ (checksum); + _ (urgent_pointer); +#undef _ + + /* Initialize bit fields. */ +#define _(f) \ + pg_edit_init_bitfield (&p->f##_flag, tcp_header_t, \ + flags, \ + TCP_FLAG_BIT_##f, 1); + + foreach_tcp_flag +#undef _ + + pg_edit_init_bitfield (&p->tcp_header_u32s, tcp_header_t, + tcp_header_u32s_and_reserved, + 4, 4); +} + +uword +unformat_pg_tcp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_tcp_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (tcp_header_t), + &group_index); + pg_tcp_header_init (p); + + /* Defaults. */ + pg_edit_set_fixed (&p->seq_number, 0); + pg_edit_set_fixed (&p->ack_number, 0); + + pg_edit_set_fixed (&p->tcp_header_u32s, sizeof (tcp_header_t) / sizeof (u32)); + + pg_edit_set_fixed (&p->window, 4096); + pg_edit_set_fixed (&p->urgent_pointer, 0); + +#define _(f) pg_edit_set_fixed (&p->f##_flag, 0); + foreach_tcp_flag +#undef _ + + p->checksum.type = PG_EDIT_UNSPECIFIED; + + if (! unformat (input, "TCP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->ports.src, + unformat_pg_edit, + unformat_tcp_udp_port, &p->ports.dst)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "window %U", + unformat_pg_edit, + unformat_pg_number, &p->window)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, + unformat_pg_number, &p->checksum)) + ; + + /* Flags. */ +#define _(f) else if (unformat (input, #f)) pg_edit_set_fixed (&p->f##_flag, 1); + foreach_tcp_flag +#undef _ + + /* Can't parse input: try next protocol level. 
*/ + else + break; + } + + { + ip_main_t * im = &ip_main; + u16 dst_port; + tcp_udp_port_info_t * pi; + + pi = 0; + if (p->ports.dst.type == PG_EDIT_FIXED) + { + dst_port = pg_edit_get_value (&p->ports.dst, PG_EDIT_LO); + pi = ip_get_tcp_udp_port_info (im, dst_port); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (! unformat_user (input, unformat_pg_payload, s)) + goto error; + + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = tcp_pg_edit_function; + g->edit_function_opaque = 0; + } + + return 1; + } + + error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + diff --git a/vnet/vnet/ip/udp.h b/vnet/vnet/ip/udp.h new file mode 100644 index 00000000000..65eef29cb10 --- /dev/null +++ b/vnet/vnet/ip/udp.h @@ -0,0 +1,113 @@ +/* + * ip/udp.h: udp protocol + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef included_udp_h +#define included_udp_h + +#include <vnet/vnet.h> +#include <vnet/ip/udp_packet.h> +#include <vnet/ip/ip.h> +#include <vnet/ip/ip4.h> +#include <vnet/ip/ip4_packet.h> +#include <vnet/pg/pg.h> +#include <vnet/ip/format.h> + +typedef enum { +#define udp_error(n,s) UDP_ERROR_##n, +#include <vnet/ip/udp_error.def> +#undef udp_error + UDP_N_ERROR, +} udp_error_t; + +#define foreach_udp4_dst_port \ +_ (67, dhcp_to_server) \ +_ (68, dhcp_to_client) \ +_ (500, ikev2) \ +_ (4341, lisp_gpe) \ +_ (4739, ipfix) \ +_ (4789, vxlan) \ +_ (4790, vxlan_gpe) \ +_ (6633, vpath_3) + + +#define foreach_udp6_dst_port \ +_ (547, dhcpv6_to_server) \ +_ (546, dhcpv6_to_client) \ +_ (6633, vpath6_3) + +typedef enum { +#define _(n,f) UDP_DST_PORT_##f = n, + foreach_udp4_dst_port + foreach_udp6_dst_port +#undef _ +} udp_dst_port_t; + +typedef enum { +#define _(n,f) UDP6_DST_PORT_##f = n, + foreach_udp6_dst_port +#undef _ +} udp6_dst_port_t; + +typedef struct { + /* Name (a c string). */ + char * name; + + /* GRE protocol type in host byte order. */ + udp_dst_port_t dst_port; + + /* Node which handles this type. */ + u32 node_index; + + /* Next index for this type. */ + u32 next_index; +} udp_dst_port_info_t; + +typedef enum { + UDP_IP6 = 0, + UDP_IP4, /* the code is full of is_ip4... */ + N_UDP_AF, +} udp_af_t; + +typedef struct { + udp_dst_port_info_t * dst_port_infos [N_UDP_AF]; + + /* Hash tables mapping name/protocol to protocol info index. */ + uword * dst_port_info_by_name[N_UDP_AF]; + uword * dst_port_info_by_dst_port[N_UDP_AF]; + + /* convenience */ + vlib_main_t * vlib_main; +} udp_main_t; + +always_inline udp_dst_port_info_t * +udp_get_dst_port_info (udp_main_t * um, udp_dst_port_t dst_port, u8 is_ip4) +{ + uword * p = hash_get (um->dst_port_info_by_dst_port[is_ip4], dst_port); + return p ? 
vec_elt_at_index (um->dst_port_infos[is_ip4], p[0]) : 0; +} + +format_function_t format_udp_header; +format_function_t format_udp_rx_trace; + +unformat_function_t unformat_udp_header; + +void udp_register_dst_port (vlib_main_t * vm, + udp_dst_port_t dst_port, + u32 node_index, u8 is_ip4); + +#endif /* included_udp_h */ + diff --git a/vnet/vnet/ip/udp_error.def b/vnet/vnet/ip/udp_error.def new file mode 100644 index 00000000000..46e3bd9ef47 --- /dev/null +++ b/vnet/vnet/ip/udp_error.def @@ -0,0 +1,20 @@ +/* + * udp_error.def: gre errors + * + * Copyright (c) 2013 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +udp_error (NONE, "no error") +udp_error (NO_LISTENER, "no listener for dst port") +udp_error (LENGTH_ERROR, "UDP packets with length errors") diff --git a/vnet/vnet/ip/udp_format.c b/vnet/vnet/ip/udp_format.c new file mode 100644 index 00000000000..dd54095908c --- /dev/null +++ b/vnet/vnet/ip/udp_format.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/udp_format.c: udp formatting
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+/*
+ * Format a UDP header.
+ *
+ * Variadic arguments, in order: a udp_header_t pointer and a u32 giving
+ * the number of header bytes available (max_header_bytes).
+ *
+ * Appends "UDP header truncated" when fewer bytes than one UDP header are
+ * available.  Otherwise appends ports, length and checksum, and, when more
+ * bytes follow and a formatter is registered for the destination port,
+ * the formatted next-layer header.  Returns the extended vector.
+ */
+u8 * format_udp_header (u8 * s, va_list * args)
+{
+  udp_header_t * udp = va_arg (*args, udp_header_t *);
+  u32 max_header_bytes = va_arg (*args, u32);
+  uword indent;
+  u32 header_bytes = sizeof (udp[0]);
+
+  /* Not even a complete UDP header to look at. */
+  if (max_header_bytes < header_bytes)
+    return format (s, "UDP header truncated");
+
+  indent = format_get_indent (s);
+  indent += 2;
+
+  s = format (s, "UDP: %d -> %d",
+              clib_net_to_host_u16 (udp->src_port),
+              clib_net_to_host_u16 (udp->dst_port));
+
+  s = format (s, "\n%Ulength %d, checksum 0x%04x",
+              format_white_space, indent,
+              clib_net_to_host_u16 (udp->length),
+              clib_net_to_host_u16 (udp->checksum));
+
+  /* Recurse into next protocol layer. */
+  if (max_header_bytes != 0 && header_bytes < max_header_bytes)
+    {
+      ip_main_t * im = &ip_main;
+      tcp_udp_port_info_t * pi;
+
+      pi = ip_get_tcp_udp_port_info (im, udp->dst_port);
+
+      /* The remaining-byte count is passed as a u32, the type this
+         function itself reads with va_arg and the type format_tcp_header
+         passes; subtracting sizeof () here would promote it to size_t. */
+      if (pi && pi->format_header)
+        s = format (s, "\n%U%U",
+                    format_white_space, indent - 2,
+                    pi->format_header,
+                    /* next protocol header */ (udp + 1),
+                    max_header_bytes - header_bytes);
+    }
+
+  return s;
+}
diff --git a/vnet/vnet/ip/udp_init.c b/vnet/vnet/ip/udp_init.c
new file mode 100644
index 00000000000..40ca032923c
--- /dev/null
+++ b/vnet/vnet/ip/udp_init.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * ip/udp_init.c: udp initialization
+ *
+ * Copyright (c) 2008 Eliot Dresselhaus
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <vnet/ip/ip.h>
+
+/*
+ * Hook UDP into the generic IP protocol table.
+ *
+ * Calls the ip_main_init init function first, then installs the UDP
+ * header formatter and the packet-generator header parser on the
+ * IP_PROTOCOL_UDP protocol-info entry.
+ *
+ * Returns the error produced by ip_main_init, an error when no
+ * protocol-info entry exists for UDP, or 0 on success.
+ */
+clib_error_t *
+udp_init (vlib_main_t * vm)
+{
+  ip_main_t * im = &ip_main;
+  ip_protocol_info_t * pi;
+  clib_error_t * error;
+
+  /* Hand a failed dependency back to the caller instead of reporting
+     success. */
+  error = vlib_call_init_function (vm, ip_main_init);
+  if (error)
+    return error;
+
+  pi = ip_get_protocol_info (im, IP_PROTOCOL_UDP);
+  if (pi == 0)
+    return clib_error_return (0, "UDP protocol info AWOL");
+
+  pi->format_header = format_udp_header;
+  pi->unformat_pg_edit = unformat_pg_udp_header;
+
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (udp_init);
diff --git a/vnet/vnet/ip/udp_local.c b/vnet/vnet/ip/udp_local.c
new file mode 100644
index 00000000000..c9355d2a322
--- /dev/null
+++ b/vnet/vnet/ip/udp_local.c
@@ -0,0 +1,508 @@
+/*
+ * node.c: udp packet processing
+ *
+ * Copyright (c) 2013 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vlib/vlib.h>
+#include <vnet/pg/pg.h>
+#include <vnet/ip/udp.h>
+#include <vnet/ip/udp_packet.h>
+#include <vppinfra/sparse_vec.h>
+
+udp_main_t udp_main;
+
+#define foreach_udp_input_next \
+  _ (PUNT, "error-punt") \
+  _ (DROP, "error-drop")
+
+typedef enum {
+#define _(s,n) UDP_INPUT_NEXT_##s,
+  foreach_udp_input_next
+#undef _
+  UDP_INPUT_N_NEXT,
+} udp_input_next_t;
+
+typedef struct {
+  u16 src_port;
+  u16 dst_port;
+} udp_rx_trace_t;
+
+u8 * format_udp_rx_trace (u8 * s, va_list * args)
+{
+  CLIB_UNUSED (vlib_main_t * vm) = va_arg (*args, vlib_main_t *);
+  CLIB_UNUSED (vlib_node_t * node) = va_arg (*args, vlib_node_t *);
+  udp_rx_trace_t * t = va_arg (*args, udp_rx_trace_t *);
+
+  s = format (s, "UDP: src-port %d dst-port %d",
+              clib_net_to_host_u16(t->src_port),
+              clib_net_to_host_u16(t->dst_port));
+  return s;
+}
+
+typedef struct {
+  /* Sparse vector mapping udp dst_port in network byte order
+     to next index. */
+  u16 * next_by_dst_port;
+
+  u32 * sparse_index_by_next_index;
+} udp_input_runtime_t;
+
+vlib_node_registration_t udp4_input_node;
+vlib_node_registration_t udp6_input_node;
+
+/*
+ * Shared body of the ip4 and ip6 UDP lookup nodes.
+ *
+ * For every buffer in from_frame: step over the IP header (the buffer
+ * arrives positioned at it), validate the UDP header, pick the next node
+ * from the node's sparse vector indexed by destination port (network byte
+ * order), record a trace if requested, step over the UDP header and
+ * enqueue.  Buffers failing a length check get UDP_ERROR_LENGTH_ERROR and
+ * go to UDP_INPUT_NEXT_DROP.
+ *
+ * is_ip4 selects the runtime data and the IP header size.
+ * Returns from_frame->n_vectors.
+ */
+always_inline uword
+udp46_input_inline (vlib_main_t * vm,
+                    vlib_node_runtime_t * node,
+                    vlib_frame_t * from_frame,
+                    int is_ip4)
+{
+  udp_input_runtime_t * rt = is_ip4 ?
+    (void *) vlib_node_get_runtime_data (vm, udp4_input_node.index)
+    : (void *) vlib_node_get_runtime_data (vm, udp6_input_node.index);
+  __attribute__((unused)) u32 n_left_from, next_index, i_next, * from, * to_next;
+
+  from = vlib_frame_vector_args (from_frame);
+  n_left_from = from_frame->n_vectors;
+
+  next_index = node->cached_next_index;
+  i_next = vec_elt (rt->sparse_index_by_next_index, next_index);
+
+  while (n_left_from > 0)
+    {
+      u32 n_left_to_next;
+
+      vlib_get_next_frame (vm, node, next_index,
+                           to_next, n_left_to_next);
+
+      while (n_left_from >= 4 && n_left_to_next >= 2)
+        {
+          u32 bi0, bi1;
+          vlib_buffer_t * b0, * b1;
+          udp_header_t * h0 = 0, * h1 = 0;
+          u32 i0, i1, dst_port0, dst_port1;
+          u32 advance0, advance1;
+          u32 error0, next0, error1, next1;
+
+          /* Prefetch next iteration. */
+          {
+            vlib_buffer_t * p2, * p3;
+
+            p2 = vlib_get_buffer (vm, from[2]);
+            p3 = vlib_get_buffer (vm, from[3]);
+
+            vlib_prefetch_buffer_header (p2, LOAD);
+            vlib_prefetch_buffer_header (p3, LOAD);
+
+            CLIB_PREFETCH (p2->data, sizeof (h0[0]), LOAD);
+            CLIB_PREFETCH (p3->data, sizeof (h1[0]), LOAD);
+          }
+
+          bi0 = from[0];
+          bi1 = from[1];
+          to_next[0] = bi0;
+          to_next[1] = bi1;
+          from += 2;
+          to_next += 2;
+          n_left_to_next -= 2;
+          n_left_from -= 2;
+
+          b0 = vlib_get_buffer (vm, bi0);
+          b1 = vlib_get_buffer (vm, bi1);
+
+          /* ip4/6_local hands us the ip header, not the udp header */
+          if (is_ip4)
+            {
+              advance0 = sizeof(ip4_header_t);
+              advance1 = sizeof(ip4_header_t);
+            }
+          else
+            {
+              advance0 = sizeof(ip6_header_t);
+              advance1 = sizeof(ip6_header_t);
+            }
+
+          /* The guard must use the header size, not the size of the
+             pointer, or it changes with the machine word size. */
+          if (PREDICT_FALSE(b0->current_length < advance0 + sizeof (*h0)))
+            {
+              error0 = UDP_ERROR_LENGTH_ERROR;
+              next0 = UDP_INPUT_NEXT_DROP;
+            }
+          else
+            {
+              vlib_buffer_advance (b0, advance0);
+              h0 = vlib_buffer_get_current (b0);
+              error0 = next0 = 0;
+              /* Same UDP length check the single-buffer loop applies. */
+              if (PREDICT_FALSE
+                  (clib_net_to_host_u16(h0->length) > b0->current_length))
+                {
+                  error0 = UDP_ERROR_LENGTH_ERROR;
+                  next0 = UDP_INPUT_NEXT_DROP;
+                }
+            }
+
+          if (PREDICT_FALSE(b1->current_length < advance1 + sizeof (*h1)))
+            {
+              error1 = UDP_ERROR_LENGTH_ERROR;
+              next1 = UDP_INPUT_NEXT_DROP;
+            }
+          else
+            {
+              vlib_buffer_advance (b1, advance1);
+              h1 = vlib_buffer_get_current (b1);
+              error1 = next1 = 0;
+              if (PREDICT_FALSE
+                  (clib_net_to_host_u16(h1->length) > b1->current_length))
+                {
+                  error1 = UDP_ERROR_LENGTH_ERROR;
+                  next1 = UDP_INPUT_NEXT_DROP;
+                }
+            }
+
+          /* Index sparse array with network byte order. */
+          dst_port0 = (error0 == 0) ? h0->dst_port : 0;
+          dst_port1 = (error1 == 0) ? h1->dst_port : 0;
+          sparse_vec_index2 (rt->next_by_dst_port, dst_port0, dst_port1,
+                             &i0, &i1);
+          next0 = (error0 == 0) ? vec_elt(rt->next_by_dst_port, i0) : next0;
+          next1 = (error1 == 0) ? vec_elt(rt->next_by_dst_port, i1) : next1;
+
+          /* Always stamp the buffer with its error: the drop path and the
+             trace code below both read b->error. */
+          if (PREDICT_TRUE (error0 == 0))
+            b0->error = node->errors[next0 == SPARSE_VEC_INVALID_INDEX ? UDP_ERROR_NO_LISTENER : UDP_ERROR_NONE];
+          else
+            b0->error = node->errors[error0];
+          if (PREDICT_TRUE (error1 == 0))
+            b1->error = node->errors[next1 == SPARSE_VEC_INVALID_INDEX ? UDP_ERROR_NO_LISTENER : UDP_ERROR_NONE];
+          else
+            b1->error = node->errors[error1];
+
+          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              udp_rx_trace_t *tr = vlib_add_trace (vm, node,
+                                                   b0, sizeof (*tr));
+              /* Ports stay zero when the packet failed a length check. */
+              tr->src_port = 0;
+              tr->dst_port = 0;
+              if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR])
+                {
+                  tr->src_port = h0->src_port;
+                  tr->dst_port = h0->dst_port;
+                }
+            }
+          if (PREDICT_FALSE(b1->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              udp_rx_trace_t *tr = vlib_add_trace (vm, node,
+                                                   b1, sizeof (*tr));
+              tr->src_port = 0;
+              tr->dst_port = 0;
+              if (b1->error != node->errors[UDP_ERROR_LENGTH_ERROR])
+                {
+                  tr->src_port = h1->src_port;
+                  tr->dst_port = h1->dst_port;
+                }
+            }
+
+          /* Step over the UDP header only when one was located; a buffer
+             too short to hold it is left where it was handed to us. */
+          if (PREDICT_TRUE (h0 != 0))
+            vlib_buffer_advance (b0, sizeof (*h0));
+          if (PREDICT_TRUE (h1 != 0))
+            vlib_buffer_advance (b1, sizeof (*h1));
+
+          vlib_validate_buffer_enqueue_x2 (vm, node, next_index,
+                                           to_next, n_left_to_next,
+                                           bi0, bi1, next0, next1);
+        }
+
+      while (n_left_from > 0 && n_left_to_next > 0)
+        {
+          u32 bi0;
+          vlib_buffer_t * b0;
+          udp_header_t * h0 = 0;
+          u32 i0, next0;
+          u32 advance0;
+
+          bi0 = from[0];
+          to_next[0] = bi0;
+          from += 1;
+          to_next += 1;
+          n_left_from -= 1;
+          n_left_to_next -= 1;
+
+          b0 = vlib_get_buffer (vm, bi0);
+
+          /* ip4/6_local hands us the ip header, not the udp header */
+          if (is_ip4)
+            advance0 = sizeof(ip4_header_t);
+          else
+            advance0 = sizeof(ip6_header_t);
+
+          if (PREDICT_FALSE(b0->current_length < advance0 + sizeof (*h0)))
+            {
+              b0->error = node->errors[UDP_ERROR_LENGTH_ERROR];
+              next0 = UDP_INPUT_NEXT_DROP;
+              goto trace_x1;
+            }
+
+          vlib_buffer_advance (b0, advance0);
+
+          h0 = vlib_buffer_get_current (b0);
+
+          if (PREDICT_TRUE
+              (clib_net_to_host_u16(h0->length) <= b0->current_length))
+            {
+              i0 = sparse_vec_index (rt->next_by_dst_port, h0->dst_port);
+              next0 = vec_elt(rt->next_by_dst_port, i0);
+
+              b0->error = node->errors [next0 == SPARSE_VEC_INVALID_INDEX ? UDP_ERROR_NO_LISTENER : UDP_ERROR_NONE];
+            }
+          else
+            {
+              b0->error = node->errors[UDP_ERROR_LENGTH_ERROR];
+              next0 = UDP_INPUT_NEXT_DROP;
+            }
+
+        trace_x1:
+          if (PREDICT_FALSE(b0->flags & VLIB_BUFFER_IS_TRACED))
+            {
+              udp_rx_trace_t *tr = vlib_add_trace (vm, node,
+                                                   b0, sizeof (*tr));
+              /* Ports stay zero when the packet failed a length check. */
+              tr->src_port = 0;
+              tr->dst_port = 0;
+              if (b0->error != node->errors[UDP_ERROR_LENGTH_ERROR])
+                {
+                  tr->src_port = h0->src_port;
+                  tr->dst_port = h0->dst_port;
+                }
+            }
+          /* h0 is still 0 when the short-buffer guard jumped here. */
+          if (PREDICT_TRUE (h0 != 0))
+            vlib_buffer_advance (b0, sizeof (*h0));
+          vlib_validate_buffer_enqueue_x1 (vm, node, next_index,
+                                           to_next, n_left_to_next,
+                                           bi0, next0);
+        }
+
+      vlib_put_next_frame (vm, node, next_index, n_left_to_next);
+    }
+  return from_frame->n_vectors;
+}
+
+static char * udp_error_strings[] = {
+#define udp_error(n,s) s,
+#include "udp_error.def"
+#undef udp_error
+};
+
+static uword
+udp4_input (vlib_main_t * vm,
+            vlib_node_runtime_t * node,
+            vlib_frame_t * from_frame)
+{
+  return udp46_input_inline (vm, node, from_frame, 1 /* is_ip4 */);
+}
+
+static uword
+udp6_input (vlib_main_t * vm,
+            vlib_node_runtime_t * node,
+            vlib_frame_t * from_frame)
+{
+  return udp46_input_inline (vm, node, from_frame, 0 /* is_ip4 */);
+}
+
+
+VLIB_REGISTER_NODE (udp4_input_node) = {
+  .function = udp4_input,
+  .name = "ip4-udp-lookup",
+  /* Takes a vector of packets.
*/
+  .vector_size = sizeof (u32),
+
+  .runtime_data_bytes = sizeof (udp_input_runtime_t),
+
+  .n_errors = UDP_N_ERROR,
+  .error_strings = udp_error_strings,
+
+  .n_next_nodes = UDP_INPUT_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [UDP_INPUT_NEXT_##s] = n,
+    foreach_udp_input_next
+#undef _
+  },
+
+  .format_buffer = format_udp_header,
+  .format_trace = format_udp_rx_trace,
+  .unformat_buffer = unformat_udp_header,
+};
+
+VLIB_REGISTER_NODE (udp6_input_node) = {
+  .function = udp6_input,
+  .name = "ip6-udp-lookup",
+  /* Takes a vector of packets. */
+  .vector_size = sizeof (u32),
+
+  .runtime_data_bytes = sizeof (udp_input_runtime_t),
+
+  .n_errors = UDP_N_ERROR,
+  .error_strings = udp_error_strings,
+
+  .n_next_nodes = UDP_INPUT_N_NEXT,
+  .next_nodes = {
+#define _(s,n) [UDP_INPUT_NEXT_##s] = n,
+    foreach_udp_input_next
+#undef _
+  },
+
+  .format_buffer = format_udp_header,
+  .format_trace = format_udp_rx_trace,
+  .unformat_buffer = unformat_udp_header,
+};
+
+/* Append a port-info record for dst_port to the is_ip4 table and index it
+   by port number and, when a name is given, by name.  The new record has
+   no node or next index yet (both ~0). */
+static void add_dst_port (udp_main_t * um,
+                          udp_dst_port_t dst_port,
+                          char * dst_port_name, u8 is_ip4)
+{
+  udp_dst_port_info_t * pi;
+  u32 i;
+
+  vec_add2 (um->dst_port_infos[is_ip4], pi, 1);
+  i = pi - um->dst_port_infos[is_ip4];
+
+  pi->name = dst_port_name;
+  pi->dst_port = dst_port;
+  pi->next_index = pi->node_index = ~0;
+
+  hash_set (um->dst_port_info_by_dst_port[is_ip4], dst_port, i);
+
+  if (pi->name)
+    hash_set_mem (um->dst_port_info_by_name[is_ip4], pi->name, i);
+}
+
+/* Route UDP packets with destination port dst_port (host byte order) to
+   node node_index, for the ip4 or ip6 lookup node as selected by is_ip4.
+   Creates the port-info record if the port is not already known. */
+void
+udp_register_dst_port (vlib_main_t * vm,
+                       udp_dst_port_t dst_port,
+                       u32 node_index, u8 is_ip4)
+{
+  udp_main_t * um = &udp_main;
+  udp_dst_port_info_t * pi;
+  udp_input_runtime_t * rt;
+  u16 * n;
+  u32 i;
+
+  {
+    clib_error_t * error = vlib_call_init_function (vm, udp_local_init);
+    if (error)
+      clib_error_report (error);
+  }
+
+  pi = udp_get_dst_port_info (um, dst_port, is_ip4);
+  if (! pi)
+    {
+      add_dst_port (um, dst_port, 0, is_ip4);
+      pi = udp_get_dst_port_info (um, dst_port, is_ip4);
+      ASSERT (pi);
+    }
+
+  pi->node_index = node_index;
+  pi->next_index = vlib_node_add_next (vm,
+                                       is_ip4 ? udp4_input_node.index
+                                       : udp6_input_node.index,
+                                       node_index);
+
+  /* Setup udp protocol -> next index sparse vector mapping. */
+  rt = vlib_node_get_runtime_data
+    (vm, is_ip4 ? udp4_input_node.index: udp6_input_node.index);
+  n = sparse_vec_validate (rt->next_by_dst_port,
+                           clib_host_to_net_u16 (dst_port));
+  n[0] = pi->next_index;
+
+  /* Rebuild next index -> sparse index inverse mapping when sparse vector
+     is updated. */
+  vec_validate (rt->sparse_index_by_next_index, pi->next_index);
+  for (i = 1; i < vec_len (rt->next_by_dst_port); i++)
+    rt->sparse_index_by_next_index[rt->next_by_dst_port[i]] = i;
+}
+
+/*
+ * Parse a UDP header of the form "src-port N dst-port M".
+ *
+ * The single variadic argument is the address of a byte vector.  On a
+ * match one zeroed udp_header_t with both ports stored in network byte
+ * order is appended to that vector and 1 is returned.  On a mismatch the
+ * vector is left untouched and 0 is returned.
+ */
+uword unformat_udp_header (unformat_input_t * input, va_list * args)
+{
+  u8 ** result = va_arg (*args, u8 **);
+  udp_header_t * udp;
+  /* NOTE(review): unformat's %d is assumed to store a full int (confirm
+     against vppinfra); u16 targets would then be overrun, hence u32. */
+  u32 src_port = 0, dst_port = 0;
+  void * p;
+
+  /* Parse before growing the vector so a failed match leaves no stray
+     bytes behind. */
+  if (! unformat (input, "src-port %d dst-port %d",
+                  &src_port, &dst_port))
+    return 0;
+
+  /* Allocate space for exactly one UDP header (not an IP header). */
+  vec_add2 (*result, p, sizeof (udp[0]));
+  udp = p;
+
+  memset (udp, 0, sizeof (udp[0]));
+  udp->src_port = clib_host_to_net_u16 (src_port);
+  udp->dst_port = clib_host_to_net_u16 (dst_port);
+  return 1;
+}
+
+/* Install the UDP buffer formatter/parser on node node_index and the UDP
+   packet-generator edit parser on its pg node. */
+static void
+udp_setup_node (vlib_main_t * vm, u32 node_index)
+{
+  vlib_node_t * n = vlib_get_node (vm, node_index);
+  pg_node_t * pn = pg_get_node (node_index);
+
+  n->format_buffer = format_udp_header;
+  n->unformat_buffer = unformat_udp_header;
+  pn->unformat_edit = unformat_pg_udp_header;
+}
+
+/* Initialize udp_main and the runtime data of both UDP lookup nodes, add
+   the ports listed in foreach_udp4_dst_port / foreach_udp6_dst_port, and
+   register the ip4 lookup node for IP_PROTOCOL_UDP.  Always returns 0; an
+   error from udp_init is reported, not returned. */
+clib_error_t * udp_local_init (vlib_main_t * vm)
+{
+  udp_input_runtime_t * rt;
+  udp_main_t * um = &udp_main;
+  int i;
+
+  {
+    clib_error_t * error;
+    error = vlib_call_init_function (vm, udp_init);
+    if (error)
+      clib_error_report (error);
+  }
+
+
+  /* One pair of hash tables per address family slot in udp_main_t. */
+  for (i = 0; i < N_UDP_AF; i++)
+    {
+      um->dst_port_info_by_name[i] = hash_create_string (0, sizeof(uword));
+      um->dst_port_info_by_dst_port[i] = hash_create (0, sizeof(uword));
+    }
+
+  udp_setup_node (vm, udp4_input_node.index);
+  udp_setup_node (vm, udp6_input_node.index);
+
+  rt = vlib_node_get_runtime_data (vm, udp4_input_node.index);
+
+  rt->next_by_dst_port = sparse_vec_new
+    (/* elt bytes */ sizeof (rt->next_by_dst_port[0]),
+     /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
+
+  vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_DROP);
+  vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_PUNT);
+  rt->sparse_index_by_next_index[UDP_INPUT_NEXT_DROP]
+    = SPARSE_VEC_INVALID_INDEX;
+  rt->sparse_index_by_next_index[UDP_INPUT_NEXT_PUNT]
+    = SPARSE_VEC_INVALID_INDEX;
+
+#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 1 /* is_ip4 */);
+  foreach_udp4_dst_port
+#undef _
+
+  rt = vlib_node_get_runtime_data (vm, udp6_input_node.index);
+
+  rt->next_by_dst_port = sparse_vec_new
+    (/* elt bytes */ sizeof (rt->next_by_dst_port[0]),
+     /* bits in index */ BITS (((udp_header_t *) 0)->dst_port));
+
+  vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_DROP);
+  vec_validate (rt->sparse_index_by_next_index, UDP_INPUT_NEXT_PUNT);
+  rt->sparse_index_by_next_index[UDP_INPUT_NEXT_DROP]
+    = SPARSE_VEC_INVALID_INDEX;
+  rt->sparse_index_by_next_index[UDP_INPUT_NEXT_PUNT]
+    = SPARSE_VEC_INVALID_INDEX;
+
+#define _(n,s) add_dst_port (um, UDP_DST_PORT_##s, #s, 0 /* is_ip4 */);
+  foreach_udp6_dst_port
+#undef _
+
+  ip4_register_protocol (IP_PROTOCOL_UDP, udp4_input_node.index);
+  /* Note: ip6 differs from ip4, UDP is hotwired to ip6-udp-lookup */
+  return 0;
+}
+
+VLIB_INIT_FUNCTION (udp_local_init);
diff --git a/vnet/vnet/ip/udp_packet.h b/vnet/vnet/ip/udp_packet.h
new file mode 100644
index 00000000000..21c30c6eb71
--- /dev/null
+++ b/vnet/vnet/ip/udp_packet.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015 Cisco and/or its affiliates.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +/* + * ip4/udp_packet.h: UDP packet format + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef included_udp_packet_h +#define included_udp_packet_h + +typedef struct { + /* Source and destination port. */ + u16 src_port, dst_port; + + /* Length of UDP header plus payload. */ + u16 length; + + /* Checksum of UDP pseudo-header and data or + zero if checksum is disabled. */ + u16 checksum; +} udp_header_t; + +#endif /* included_udp_packet_h */ + diff --git a/vnet/vnet/ip/udp_pg.c b/vnet/vnet/ip/udp_pg.c new file mode 100644 index 00000000000..a33a56294fb --- /dev/null +++ b/vnet/vnet/ip/udp_pg.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2015 Cisco and/or its affiliates. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * ip/udp_pg: UDP packet-generator interface + * + * Copyright (c) 2008 Eliot Dresselhaus + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include <vnet/pg/pg.h> +#include <vnet/ip/ip.h> /* for unformat_udp_udp_port */ + +#define UDP_PG_EDIT_LENGTH (1 << 0) +#define UDP_PG_EDIT_CHECKSUM (1 << 1) + +always_inline void +udp_pg_edit_function_inline (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets, + u32 flags) +{ + vlib_main_t * vm = pg->vlib_main; + u32 ip_offset, udp_offset; + + udp_offset = g->start_byte_offset; + ip_offset = (g-1)->start_byte_offset; + + while (n_packets >= 1) + { + vlib_buffer_t * p0; + ip4_header_t * ip0; + udp_header_t * udp0; + u32 udp_len0; + + p0 = vlib_get_buffer (vm, packets[0]); + n_packets -= 1; + packets += 1; + + ip0 = (void *) (p0->data + ip_offset); + udp0 = (void *) (p0->data + udp_offset); + udp_len0 = clib_net_to_host_u16 (ip0->length) - sizeof (ip0[0]); + + if (flags & UDP_PG_EDIT_LENGTH) + udp0->length = + clib_net_to_host_u16 (vlib_buffer_length_in_chain (vm, p0) + - ip_offset); + + /* Initialize checksum with header. */ + if (flags & UDP_PG_EDIT_CHECKSUM) + { + ip_csum_t sum0; + + sum0 = clib_mem_unaligned (&ip0->src_address, u64); + + sum0 = ip_csum_with_carry + (sum0, clib_host_to_net_u32 (udp_len0 + (ip0->protocol << 16))); + + /* Invalidate possibly old checksum. */ + udp0->checksum = 0; + + sum0 = ip_incremental_checksum_buffer (vm, p0, udp_offset, udp_len0, sum0); + + sum0 = ~ ip_csum_fold (sum0); + + /* Zero checksum means checksumming disabled. */ + sum0 = sum0 != 0 ? 
sum0 : 0xffff; + + udp0->checksum = sum0; + } + } +} + +static void +udp_pg_edit_function (pg_main_t * pg, + pg_stream_t * s, + pg_edit_group_t * g, + u32 * packets, + u32 n_packets) +{ + switch (g->edit_function_opaque) + { + case UDP_PG_EDIT_LENGTH: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_LENGTH); + break; + + case UDP_PG_EDIT_CHECKSUM: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_CHECKSUM); + break; + + case UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH: + udp_pg_edit_function_inline (pg, s, g, packets, n_packets, + UDP_PG_EDIT_CHECKSUM | UDP_PG_EDIT_LENGTH); + break; + + default: + ASSERT (0); + break; + } +} + +typedef struct { + pg_edit_t src_port, dst_port; + pg_edit_t length; + pg_edit_t checksum; +} pg_udp_header_t; + +static inline void +pg_udp_header_init (pg_udp_header_t * p) +{ + /* Initialize fields that are not bit fields in the IP header. */ +#define _(f) pg_edit_init (&p->f, udp_header_t, f); + _ (src_port); + _ (dst_port); + _ (length); + _ (checksum); +#undef _ +} + +uword +unformat_pg_udp_header (unformat_input_t * input, va_list * args) +{ + pg_stream_t * s = va_arg (*args, pg_stream_t *); + pg_udp_header_t * p; + u32 group_index; + + p = pg_create_edit_group (s, sizeof (p[0]), sizeof (udp_header_t), + &group_index); + pg_udp_header_init (p); + + /* Defaults. */ + p->checksum.type = PG_EDIT_UNSPECIFIED; + p->length.type = PG_EDIT_UNSPECIFIED; + + if (! unformat (input, "UDP: %U -> %U", + unformat_pg_edit, + unformat_tcp_udp_port, &p->src_port, + unformat_pg_edit, + unformat_tcp_udp_port, &p->dst_port)) + goto error; + + /* Parse options. */ + while (1) + { + if (unformat (input, "length %U", + unformat_pg_edit, + unformat_pg_number, &p->length)) + ; + + else if (unformat (input, "checksum %U", + unformat_pg_edit, + unformat_pg_number, &p->checksum)) + ; + + /* Can't parse input: try next protocol level. 
*/ + else + break; + } + + { + ip_main_t * im = &ip_main; + u16 dst_port; + tcp_udp_port_info_t * pi; + + pi = 0; + if (p->dst_port.type == PG_EDIT_FIXED) + { + dst_port = pg_edit_get_value (&p->dst_port, PG_EDIT_LO); + pi = ip_get_tcp_udp_port_info (im, dst_port); + } + + if (pi && pi->unformat_pg_edit + && unformat_user (input, pi->unformat_pg_edit, s)) + ; + + else if (! unformat_user (input, unformat_pg_payload, s)) + goto error; + + p = pg_get_edit_group (s, group_index); + if (p->checksum.type == PG_EDIT_UNSPECIFIED + || p->length.type == PG_EDIT_UNSPECIFIED) + { + pg_edit_group_t * g = pg_stream_get_group (s, group_index); + g->edit_function = udp_pg_edit_function; + g->edit_function_opaque = 0; + if (p->checksum.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= UDP_PG_EDIT_CHECKSUM; + if (p->length.type == PG_EDIT_UNSPECIFIED) + g->edit_function_opaque |= UDP_PG_EDIT_LENGTH; + } + + return 1; + } + + error: + /* Free up any edits we may have added. */ + pg_free_edit_group (s); + return 0; +} + |