From 0b6a857d85df97e887de7aaf00fd6bd2dae39bf8 Mon Sep 17 00:00:00 2001 From: Neale Ranns Date: Wed, 30 Oct 2019 17:34:14 +0000 Subject: ip: Fragmentation fixes Type: fix If the packet is about to be fragmented, then don't call any of the actions that expect the rewrite to have been written. 1) don't double count packets through the adjacency (original & fragments) 2) don't double decrement the TTL for fragments 3) return to ip4-midchain post ip-frag if that's where we started. 4) only run midchain/mcast fixups if not fragmenting (if no errors) Change-Id: Ib2866787a42713ee5871b87b597d8f74b901044b Signed-off-by: Neale Ranns --- src/vnet/interface_output.h | 6 +- src/vnet/ip/ip4_forward.c | 217 ++++++++++++++++++++++++-------------------- src/vnet/ip/ip6_forward.c | 2 +- src/vnet/ip/ip_frag.c | 30 ++++++ src/vnet/ip/ip_frag.h | 2 + src/vnet/ip/lookup.c | 19 +--- 6 files changed, 159 insertions(+), 117 deletions(-) (limited to 'src/vnet') diff --git a/src/vnet/interface_output.h b/src/vnet/interface_output.h index 58f7f617957..f1fa4d85b5e 100644 --- a/src/vnet/interface_output.h +++ b/src/vnet/interface_output.h @@ -62,7 +62,6 @@ calc_checksums (vlib_main_t * vm, vlib_buffer_t * b) if (is_ip4) { - ip4 = (ip4_header_t *) (b->data + vnet_buffer (b)->l3_hdr_offset); if (b->flags & VNET_BUFFER_F_OFFLOAD_IP_CKSUM) ip4->checksum = ip4_header_checksum (ip4); if (b->flags & VNET_BUFFER_F_OFFLOAD_TCP_CKSUM) @@ -71,7 +70,10 @@ calc_checksums (vlib_main_t * vm, vlib_buffer_t * b) th->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4); } if (b->flags & VNET_BUFFER_F_OFFLOAD_UDP_CKSUM) - uh->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4); + { + uh->checksum = 0; + uh->checksum = ip4_tcp_udp_compute_checksum (vm, b, ip4); + } } if (is_ip6) { diff --git a/src/vnet/ip/ip4_forward.c b/src/vnet/ip/ip4_forward.c index 40c396c4f3b..1550b313915 100644 --- a/src/vnet/ip/ip4_forward.c +++ b/src/vnet/ip/ip4_forward.c @@ -1203,7 +1203,7 @@ format_ip4_rewrite_trace (u8 * s, va_list * args) 
s = format (s, "\n%U%U", format_white_space, indent, format_ip_adjacency_packet_data, - t->dpo_index, t->packet_data, sizeof (t->packet_data)); + t->packet_data, sizeof (t->packet_data)); return s; } @@ -2293,7 +2293,8 @@ typedef enum always_inline void ip4_mtu_check (vlib_buffer_t * b, u16 packet_len, - u16 adj_packet_bytes, bool df, u16 * next, u32 * error) + u16 adj_packet_bytes, bool df, u16 * next, u32 * error, + u8 is_midchain) { if (packet_len > adj_packet_bytes) { @@ -2310,12 +2311,39 @@ ip4_mtu_check (vlib_buffer_t * b, u16 packet_len, { /* IP fragmentation */ ip_frag_set_vnet_buffer (b, adj_packet_bytes, - IP4_FRAG_NEXT_IP4_REWRITE, 0); + (is_midchain ? + IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN : + IP4_FRAG_NEXT_IP4_REWRITE), 0); *next = IP4_REWRITE_NEXT_FRAGMENT; } } } +/* increment TTL & update checksum. + Works either endian, so no need for byte swap. */ +static_always_inline void +ip4_ttl_inc (vlib_buffer_t * b, ip4_header_t * ip) +{ + i32 ttl; + u32 checksum; + if (PREDICT_FALSE (b->flags & VNET_BUFFER_F_LOCALLY_ORIGINATED)) + { + b->flags &= ~VNET_BUFFER_F_LOCALLY_ORIGINATED; + return; + } + + ttl = ip->ttl; + + checksum = ip->checksum - clib_host_to_net_u16 (0x0100); + checksum += checksum >= 0xffff; + + ip->checksum = checksum; + ttl += 1; + ip->ttl = ttl; + + ASSERT (ip->checksum == ip4_header_checksum (ip)); +} + /* Decrement TTL & update checksum. Works either endian, so no need for byte swap. 
*/ static_always_inline void @@ -2458,12 +2486,12 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm, adj0[0].rewrite_header.max_l3_packet_bytes, ip0->flags_and_fragment_offset & clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT), - next + 0, &error0); + next + 0, &error0, is_midchain); ip4_mtu_check (b[1], ip1_len, adj1[0].rewrite_header.max_l3_packet_bytes, ip1->flags_and_fragment_offset & clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT), - next + 1, &error1); + next + 1, &error1, is_midchain); if (is_mcast) { @@ -2481,6 +2509,7 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm, { u32 next_index = adj0[0].rewrite_header.next_index; vlib_buffer_advance (b[0], -(word) rw_len0); + tx_sw_if_index0 = adj0[0].rewrite_header.sw_if_index; vnet_buffer (b[0])->sw_if_index[VLIB_TX] = tx_sw_if_index0; @@ -2489,10 +2518,14 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm, vnet_feature_arc_start (lm->output_feature_arc_index, tx_sw_if_index0, &next_index, b[0]); next[0] = next_index; + if (is_midchain) + calc_checksums (vm, b[0]); } else { b[0]->error = error_node->errors[error0]; + if (error0 == IP4_ERROR_MTU_EXCEEDED) + ip4_ttl_inc (b[0], ip0); } if (PREDICT_TRUE (error1 == IP4_ERROR_NONE)) { @@ -2507,57 +2540,58 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm, vnet_feature_arc_start (lm->output_feature_arc_index, tx_sw_if_index1, &next_index, b[1]); next[1] = next_index; + if (is_midchain) + calc_checksums (vm, b[1]); } else { b[1]->error = error_node->errors[error1]; + if (error1 == IP4_ERROR_MTU_EXCEEDED) + ip4_ttl_inc (b[1], ip1); } - if (is_midchain) - { - calc_checksums (vm, b[0]); - calc_checksums (vm, b[1]); - } + /* Guess we are only writing on simple Ethernet header. 
*/ vnet_rewrite_two_headers (adj0[0], adj1[0], ip0, ip1, sizeof (ethernet_header_t)); - /* - * Bump the per-adjacency counters - */ if (do_counters) { - vlib_increment_combined_counter - (&adjacency_counters, - thread_index, - adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0); - - vlib_increment_combined_counter - (&adjacency_counters, - thread_index, - adj_index1, 1, vlib_buffer_length_in_chain (vm, b[1]) + rw_len1); + if (error0 == IP4_ERROR_NONE) + vlib_increment_combined_counter + (&adjacency_counters, + thread_index, + adj_index0, 1, + vlib_buffer_length_in_chain (vm, b[0]) + rw_len0); + + if (error1 == IP4_ERROR_NONE) + vlib_increment_combined_counter + (&adjacency_counters, + thread_index, + adj_index1, 1, + vlib_buffer_length_in_chain (vm, b[1]) + rw_len1); } if (is_midchain) { - if (adj0->sub_type.midchain.fixup_func) + if (error0 == IP4_ERROR_NONE && adj0->sub_type.midchain.fixup_func) adj0->sub_type.midchain.fixup_func (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data); - if (adj1->sub_type.midchain.fixup_func) + if (error1 == IP4_ERROR_NONE && adj1->sub_type.midchain.fixup_func) adj1->sub_type.midchain.fixup_func (vm, adj1, b[1], adj1->sub_type.midchain.fixup_data); } if (is_mcast) { - /* - * copy bytes from the IP address into the MAC rewrite - */ - vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, - adj0->rewrite_header.dst_mcast_offset, - &ip0->dst_address.as_u32, (u8 *) ip0); - vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, - adj1->rewrite_header.dst_mcast_offset, - &ip1->dst_address.as_u32, (u8 *) ip1); + /* copy bytes from the IP address into the MAC rewrite */ + if (error0 == IP4_ERROR_NONE) + vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, + adj0->rewrite_header.dst_mcast_offset, + &ip0->dst_address.as_u32, (u8 *) ip0); + if (error1 == IP4_ERROR_NONE) + vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, + adj1->rewrite_header.dst_mcast_offset, + &ip1->dst_address.as_u32, (u8 *) ip1); } next += 2; @@ -2626,7 +2660,7 @@ 
ip4_rewrite_inline_with_gso (vlib_main_t * vm, adj0[0].rewrite_header.max_l3_packet_bytes, ip0->flags_and_fragment_offset & clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT), - next + 0, &error0); + next + 0, &error0, is_midchain); if (is_mcast) { @@ -2649,44 +2683,38 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm, vnet_feature_arc_start (lm->output_feature_arc_index, tx_sw_if_index0, &next_index, b[0]); next[0] = next_index; - } - else - { - b[0]->error = error_node->errors[error0]; - } - if (is_midchain) - { - calc_checksums (vm, b[0]); - } - /* Guess we are only writing on simple Ethernet header. */ - vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); - /* - * Bump the per-adjacency counters - */ - if (do_counters) - { - vlib_increment_combined_counter - (&adjacency_counters, - thread_index, - adj_index0, 1, vlib_buffer_length_in_chain (vm, b[0]) + rw_len0); - } + if (is_midchain) + calc_checksums (vm, b[0]); - if (is_midchain) - { - if (adj0->sub_type.midchain.fixup_func) + /* Guess we are only writing on simple Ethernet header. 
*/ + vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); + + /* + * Bump the per-adjacency counters + */ + if (do_counters) + vlib_increment_combined_counter + (&adjacency_counters, + thread_index, + adj_index0, 1, vlib_buffer_length_in_chain (vm, + b[0]) + rw_len0); + + if (is_midchain && adj0->sub_type.midchain.fixup_func) adj0->sub_type.midchain.fixup_func (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data); - } - if (is_mcast) + if (is_mcast) + /* copy bytes from the IP address into the MAC rewrite */ + vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, + adj0->rewrite_header.dst_mcast_offset, + &ip0->dst_address.as_u32, (u8 *) ip0); + } + else { - /* - * copy bytes from the IP address into the MAC rewrite - */ - vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, - adj0->rewrite_header.dst_mcast_offset, - &ip0->dst_address.as_u32, (u8 *) ip0); + b[0]->error = error_node->errors[error0]; + if (error0 == IP4_ERROR_MTU_EXCEEDED) + ip4_ttl_inc (b[0], ip0); } next += 1; @@ -2730,7 +2758,7 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm, adj0[0].rewrite_header.max_l3_packet_bytes, ip0->flags_and_fragment_offset & clib_host_to_net_u16 (IP4_HEADER_FLAG_DONT_FRAGMENT), - next + 0, &error0); + next + 0, &error0, is_midchain); if (is_mcast) { @@ -2753,39 +2781,36 @@ ip4_rewrite_inline_with_gso (vlib_main_t * vm, vnet_feature_arc_start (lm->output_feature_arc_index, tx_sw_if_index0, &next_index, b[0]); next[0] = next_index; - } - else - { - b[0]->error = error_node->errors[error0]; - } - if (is_midchain) - { - calc_checksums (vm, b[0]); - } - /* Guess we are only writing on simple Ethernet header. 
*/ - vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); - if (do_counters) - vlib_increment_combined_counter - (&adjacency_counters, - thread_index, adj_index0, 1, - vlib_buffer_length_in_chain (vm, b[0]) + rw_len0); + if (is_midchain) + /* this acts on the packet that is about to be encapped */ + calc_checksums (vm, b[0]); - if (is_midchain) - { - if (adj0->sub_type.midchain.fixup_func) + /* Guess we are only writing on simple Ethernet header. */ + vnet_rewrite_one_header (adj0[0], ip0, sizeof (ethernet_header_t)); + + if (do_counters) + vlib_increment_combined_counter + (&adjacency_counters, + thread_index, adj_index0, 1, + vlib_buffer_length_in_chain (vm, b[0]) + rw_len0); + + if (is_midchain && adj0->sub_type.midchain.fixup_func) adj0->sub_type.midchain.fixup_func (vm, adj0, b[0], adj0->sub_type.midchain.fixup_data); - } - if (is_mcast) + if (is_mcast) + /* copy bytes from the IP address into the MAC rewrite */ + vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, + adj0->rewrite_header.dst_mcast_offset, + &ip0->dst_address.as_u32, (u8 *) ip0); + } + else { - /* - * copy bytes from the IP address into the MAC rewrite - */ - vnet_ip_mcast_fixup_header (IP4_MCAST_ADDR_MASK, - adj0->rewrite_header.dst_mcast_offset, - &ip0->dst_address.as_u32, (u8 *) ip0); + b[0]->error = error_node->errors[error0]; + /* undo the TTL decrement - we'll be back to do it again */ + if (error0 == IP4_ERROR_MTU_EXCEEDED) + ip4_ttl_inc (b[0], ip0); } next += 1; @@ -2943,8 +2968,8 @@ VLIB_REGISTER_NODE (ip4_mcast_midchain_node) = { VLIB_REGISTER_NODE (ip4_midchain_node) = { .name = "ip4-midchain", .vector_size = sizeof (u32), - .format_trace = format_ip4_forward_next_trace, - .sibling_of = "ip4-rewrite", + .format_trace = format_ip4_rewrite_trace, + .sibling_of = "ip4-rewrite", }; /* *INDENT-ON */ diff --git a/src/vnet/ip/ip6_forward.c b/src/vnet/ip/ip6_forward.c index 47fb57ae201..50de501fe0d 100644 --- a/src/vnet/ip/ip6_forward.c +++ b/src/vnet/ip/ip6_forward.c @@ -908,7 
+908,7 @@ format_ip6_rewrite_trace (u8 * s, va_list * args) s = format (s, "\n%U%U", format_white_space, indent, format_ip_adjacency_packet_data, - t->adj_index, t->packet_data, sizeof (t->packet_data)); + t->packet_data, sizeof (t->packet_data)); return s; } diff --git a/src/vnet/ip/ip_frag.c b/src/vnet/ip/ip_frag.c index 230722c45db..54efb63c986 100644 --- a/src/vnet/ip/ip_frag.c +++ b/src/vnet/ip/ip_frag.c @@ -200,6 +200,17 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer, clib_memcpy_fast (to_b->data, org_from_packet, sizeof (ip4_header_t)); to_ip4 = vlib_buffer_get_current (to_b); to_data = (void *) (to_ip4 + 1); + vnet_buffer (to_b)->l3_hdr_offset = to_b->current_data; + to_b->flags |= VNET_BUFFER_F_L3_HDR_OFFSET_VALID; + + if (from_b->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID) + { + vnet_buffer (to_b)->l4_hdr_offset = + (vnet_buffer (to_b)->l3_hdr_offset + + (vnet_buffer (from_b)->l4_hdr_offset - + vnet_buffer (from_b)->l3_hdr_offset)); + to_b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID; + } /* Spin through from buffers filling up the to buffer */ u16 left_in_to_buffer = len, to_ptr = 0; @@ -232,6 +243,7 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer, } to_b->current_length = len + sizeof (ip4_header_t); + to_b->flags |= VNET_BUFFER_F_IS_IP4; to_ip4->fragment_id = ip_frag_id; to_ip4->flags_and_fragment_offset = @@ -241,6 +253,9 @@ ip4_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer, to_ip4->length = clib_host_to_net_u16 (len + sizeof (ip4_header_t)); to_ip4->checksum = ip4_header_checksum (to_ip4); + /* we've just done the IP checksum .. 
*/ + to_b->flags &= ~VNET_BUFFER_F_OFFLOAD_IP_CKSUM; + if (vnet_buffer (org_from_b)->ip_frag.flags & IP_FRAG_FLAG_IP4_HEADER) { /* Encapsulating ipv4 header */ @@ -482,6 +497,19 @@ ip6_frag_do_fragment (vlib_main_t * vm, u32 from_bi, u32 ** buffer, to_frag_hdr = (ip6_frag_hdr_t *) (to_ip6 + 1); to_data = (void *) (to_frag_hdr + 1); + vnet_buffer (to_b)->l3_hdr_offset = to_b->current_data; + to_b->flags |= VNET_BUFFER_F_L3_HDR_OFFSET_VALID; + + if (from_b->flags & VNET_BUFFER_F_L4_HDR_OFFSET_VALID) + { + vnet_buffer (to_b)->l4_hdr_offset = + (vnet_buffer (to_b)->l3_hdr_offset + + (vnet_buffer (from_b)->l4_hdr_offset - + vnet_buffer (from_b)->l3_hdr_offset)); + to_b->flags |= VNET_BUFFER_F_L4_HDR_OFFSET_VALID; + } + to_b->flags |= VNET_BUFFER_F_IS_IP6; + /* Spin through from buffers filling up the to buffer */ u16 left_in_to_buffer = len, to_ptr = 0; while (1) @@ -551,6 +579,7 @@ VLIB_REGISTER_NODE (ip4_frag_node) = { .n_next_nodes = IP4_FRAG_N_NEXT, .next_nodes = { [IP4_FRAG_NEXT_IP4_REWRITE] = "ip4-rewrite", + [IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN] = "ip4-midchain", [IP4_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup", [IP4_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup", [IP4_FRAG_NEXT_MPLS_OUTPUT] = "mpls-output", @@ -574,6 +603,7 @@ VLIB_REGISTER_NODE (ip6_frag_node) = { .n_next_nodes = IP6_FRAG_N_NEXT, .next_nodes = { [IP6_FRAG_NEXT_IP6_REWRITE] = "ip6-rewrite", + [IP6_FRAG_NEXT_IP6_REWRITE_MIDCHAIN] = "ip6-midchain", [IP6_FRAG_NEXT_IP4_LOOKUP] = "ip4-lookup", [IP6_FRAG_NEXT_IP6_LOOKUP] = "ip6-lookup", [IP6_FRAG_NEXT_MPLS_OUTPUT] = "mpls-output", diff --git a/src/vnet/ip/ip_frag.h b/src/vnet/ip/ip_frag.h index b66db416129..ce4236b8465 100644 --- a/src/vnet/ip/ip_frag.h +++ b/src/vnet/ip/ip_frag.h @@ -50,6 +50,7 @@ extern vlib_node_registration_t ip6_frag_node; typedef enum { IP4_FRAG_NEXT_IP4_REWRITE, + IP4_FRAG_NEXT_IP4_REWRITE_MIDCHAIN, IP4_FRAG_NEXT_IP4_LOOKUP, IP4_FRAG_NEXT_IP6_LOOKUP, IP4_FRAG_NEXT_MPLS_OUTPUT, @@ -63,6 +64,7 @@ typedef enum IP6_FRAG_NEXT_IP4_LOOKUP, 
IP6_FRAG_NEXT_IP6_LOOKUP, IP6_FRAG_NEXT_IP6_REWRITE, + IP6_FRAG_NEXT_IP6_REWRITE_MIDCHAIN, IP6_FRAG_NEXT_MPLS_OUTPUT, IP6_FRAG_NEXT_DROP, IP6_FRAG_N_NEXT diff --git a/src/vnet/ip/lookup.c b/src/vnet/ip/lookup.c index 4db7660eea9..c1fbc429b97 100644 --- a/src/vnet/ip/lookup.c +++ b/src/vnet/ip/lookup.c @@ -258,27 +258,10 @@ format_ip_flow_hash_config (u8 * s, va_list * args) u8 * format_ip_adjacency_packet_data (u8 * s, va_list * args) { - u32 adj_index = va_arg (*args, u32); u8 *packet_data = va_arg (*args, u8 *); u32 n_packet_data_bytes = va_arg (*args, u32); - ip_adjacency_t *adj; - if (!adj_is_valid (adj_index)) - return format (s, ""); - - adj = adj_get (adj_index); - - switch (adj->lookup_next_index) - { - case IP_LOOKUP_NEXT_REWRITE: - case IP_LOOKUP_NEXT_MCAST: - s = - format (s, "%U", format_hex_bytes, packet_data, n_packet_data_bytes); - break; - - default: - break; - } + s = format (s, "%U", format_hex_bytes, packet_data, n_packet_data_bytes); return s; } -- cgit 1.2.3-korg