From 6534b7aa13bc5bed15ed87f47bb766405963e9e8 Mon Sep 17 00:00:00 2001 From: Florin Coras Date: Tue, 18 Jul 2017 05:38:03 -0400 Subject: Improvements to tcp rx path and debugging - Increment rcv_nxt for fin packets - Call tcp_segment_rcv only if buffer has data - Parse rcv opts before deleting half-open connection - Fix initial rcv_wnd - Improved event logging Change-Id: I9b83c04f432c4cec832c480b03e534deff02c3b1 Signed-off-by: Florin Coras --- src/vnet/tcp/tcp_input.c | 165 ++++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 66 deletions(-) (limited to 'src/vnet/tcp/tcp_input.c') diff --git a/src/vnet/tcp/tcp_input.c b/src/vnet/tcp/tcp_input.c index bc7d9015..cc5cecdc 100644 --- a/src/vnet/tcp/tcp_input.c +++ b/src/vnet/tcp/tcp_input.c @@ -349,7 +349,10 @@ tcp_segment_validate (vlib_main_t * vm, tcp_connection_t * tc0, /* 4th: check the SYN bit */ if (tcp_syn (th0)) { - tcp_send_reset (b0, tc0->c_is_ip4); + /* TODO implement RFC 5961 */ + tcp_make_ack (tc0, b0); + *next0 = tcp_next_output (tc0->c_is_ip4); + TCP_EVT_DBG (TCP_EVT_SYN_RCVD, tc0); return -1; } @@ -1246,8 +1249,6 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, * Looks okay, process feedback */ - TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); - if (tcp_opts_sack_permitted (&tc->rcv_opts)) tcp_rcv_sacks (tc, vnet_buffer (b)->tcp.ack_number); @@ -1263,6 +1264,8 @@ tcp_rcv_ack (tcp_connection_t * tc, vlib_buffer_t * b, if (tc->bytes_acked) tcp_dequeue_acked (tc, vnet_buffer (b)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_ACK_RCVD, tc); + /* * Check if we have congestion event */ @@ -1496,9 +1499,13 @@ tcp_can_delack (tcp_connection_t * tc) static int tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, - u16 n_data_bytes, u32 * next0) + u32 * next0) { - u32 error = 0, n_bytes_to_drop; + u32 error = 0, n_bytes_to_drop, n_data_bytes; + + vlib_buffer_advance (b, vnet_buffer (b)->tcp.data_offset); + n_data_bytes = vnet_buffer (b)->tcp.data_len; + ASSERT (n_data_bytes); /* Handle out-of-order data */ if (PREDICT_FALSE (vnet_buffer (b)->tcp.seq_number != tc->rcv_nxt)) @@ -1512,7 +1519,12 @@ tcp_segment_rcv (tcp_main_t * tm, tcp_connection_t * tc, vlib_buffer_t * b, /* Completely in the past (possible retransmit) */ if (seq_leq (vnet_buffer (b)->tcp.seq_end, tc->rcv_nxt)) - goto done; + { + /* Ack retransmissions since we may not have any data to send */ + tcp_make_ack (tc, b); + *next0 = tcp_next_output (tc->c_is_ip4); + goto done; + } /* Chop off the bytes in the past */ n_bytes_to_drop = tc->rcv_nxt - vnet_buffer (b)->tcp.seq_number; @@ -1550,12 +1562,6 @@ in_order: * segments can be enqueued after fifo tail offset changes. */ error = tcp_session_enqueue_data (tc, b, n_data_bytes); - if (n_data_bytes == 0) - { - *next0 = TCP_NEXT_DROP; - goto done; - } - /* Check if ACK can be delayed */ if (tcp_can_delack (tc)) { @@ -1680,7 +1686,9 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } th0 = tcp_buffer_hdr (b0); - is_fin = (th0->flags & TCP_FLAG_FIN) != 0; + /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a + * dangling reference. */ + is_fin = tcp_is_fin (th0); /* SYNs, FINs and data consume sequence numbers */ vnet_buffer (b0)->tcp.seq_end = vnet_buffer (b0)->tcp.seq_number @@ -1700,29 +1708,23 @@ tcp46_established_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 5: check the ACK field */ if (tcp_rcv_ack (tc0, b0, th0, &next0, &error0)) - { - goto done; - } + goto done; /* 6: check the URG bit TODO */ /* 7: process the segment text */ - - vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); - error0 = tcp_segment_rcv (tm, tc0, b0, - vnet_buffer (b0)->tcp.data_len, &next0); - - /* N.B. buffer is rewritten if segment is ooo. Thus, th0 becomes a - * dangling reference. */ + if (vnet_buffer (b0)->tcp.data_len) + error0 = tcp_segment_rcv (tm, tc0, b0, &next0); /* 8: check the FIN bit */ - if (is_fin) + if (PREDICT_FALSE (is_fin)) { /* Enter CLOSE-WAIT and notify session. Don't send ACK, instead * wait for session to call close. To avoid lingering * in CLOSE-WAIT, set timer (reuse WAITCLOSE). */ tc0->state = TCP_STATE_CLOSE_WAIT; TCP_EVT_DBG (TCP_EVT_FIN_RCVD, tc0); + tc0->rcv_nxt += (vnet_buffer (b0)->tcp.data_len == 0); stream_session_disconnect_notify (&tc0->connection); tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); } @@ -1856,6 +1858,21 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, seq0 = vnet_buffer (b0)->tcp.seq_number; tcp0 = tcp_buffer_hdr (b0); + if (!tc0) + { + ip4_header_t *ip40 = vlib_buffer_get_current (b0); + tcp0 = ip4_next_header (ip40); + tc0 = + (tcp_connection_t *) + stream_session_lookup_transport_wt4 (&ip40->dst_address, + &ip40->src_address, + tcp0->dst_port, + tcp0->src_port, + SESSION_TYPE_IP4_TCP, + my_thread_index); + ASSERT (0); + goto drop; + } if (PREDICT_FALSE (!tcp_ack (tcp0) && !tcp_rst (tcp0) && !tcp_syn (tcp0))) goto drop; @@ -1881,8 +1898,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (ack0 <= tc0->iss || ack0 > tc0->snd_nxt) { if (!tcp_rst (tcp0)) - tcp_send_reset (b0, is_ip4); - + tcp_send_reset (tc0, b0, is_ip4); goto drop; } @@ -1900,11 +1916,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* If ACK is acceptable, signal client that peer is not * willing to accept connection and drop connection*/ if (tcp_ack (tcp0)) - { - stream_session_connect_notify (&tc0->connection, sst, - 1 /* fail */ ); - tcp_connection_cleanup (tc0); - } + tcp_connection_reset (tc0); goto drop; } @@ -1920,6 +1932,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, if (!tcp_syn (tcp0)) goto drop; + /* Parse options */ + if (tcp_options_parse (tcp0, &tc0->rcv_opts)) + goto drop; + /* Stop connection establishment and retransmit timers */ tcp_timer_reset (tc0, TCP_TIMER_ESTABLISH); tcp_timer_reset (tc0, TCP_TIMER_RETRANSMIT_SYN); @@ -1928,19 +1944,11 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, * current thread pool. */ pool_get (tm->connections[my_thread_index], new_tc0); clib_memcpy (new_tc0, tc0, sizeof (*new_tc0)); - - new_tc0->c_thread_index = my_thread_index; new_tc0->c_c_index = new_tc0 - tm->connections[my_thread_index]; - - /* Cleanup half-open connection XXX lock */ - pool_put (tm->half_open_connections, tc0); - + new_tc0->c_thread_index = my_thread_index; new_tc0->rcv_nxt = vnet_buffer (b0)->tcp.seq_end; new_tc0->irs = seq0; - - /* Parse options */ - if (tcp_options_parse (tcp0, &new_tc0->rcv_opts)) - goto drop; + tcp_half_open_connection_del (tc0); if (tcp_opts_tstamp (&new_tc0->rcv_opts)) { @@ -1959,7 +1967,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_connection_init_vars (new_tc0); /* SYN-ACK: See if we can switch to ESTABLISHED state */ - if (tcp_ack (tcp0)) + if (PREDICT_TRUE (tcp_ack (tcp0))) { /* Our SYN is ACKed: we have iss < ack = snd_una */ @@ -1976,7 +1984,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, 0)) { tcp_connection_cleanup (new_tc0); - tcp_send_reset (b0, is_ip4); + tcp_send_reset (tc0, b0, is_ip4); goto drop; } @@ -1986,6 +1994,7 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* Update rtt with the syn-ack sample */ new_tc0->bytes_acked = 1; tcp_update_rtt (new_tc0, vnet_buffer (b0)->tcp.ack_number); + TCP_EVT_DBG (TCP_EVT_SYNACK_RCVD, new_tc0); } /* SYN: Simultaneous open. Change state to SYN-RCVD and send SYN-ACK */ else @@ -1997,12 +2006,12 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, (&new_tc0->connection, sst, 0)) { tcp_connection_cleanup (new_tc0); - tcp_send_reset (b0, is_ip4); + tcp_send_reset (tc0, b0, is_ip4); + TCP_EVT_DBG (TCP_EVT_RST_SENT, tc0); goto drop; } tc0->rtt_ts = 0; - tcp_make_synack (new_tc0, b0); next0 = tcp_next_output (is_ip4); @@ -2010,12 +2019,10 @@ tcp46_syn_sent_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } /* Read data, if any */ - if (vnet_buffer (b0)->tcp.data_len) + if (PREDICT_FALSE (vnet_buffer (b0)->tcp.data_len)) { - vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); - error0 = tcp_segment_rcv (tm, new_tc0, b0, - vnet_buffer (b0)->tcp.data_len, - &next0); + ASSERT (0); + error0 = tcp_segment_rcv (tm, new_tc0, b0, &next0); if (error0 == TCP_ERROR_PURE_ACK) error0 = TCP_ERROR_SYN_ACKS_RCVD; } @@ -2114,6 +2121,7 @@ VLIB_REGISTER_NODE (tcp6_syn_sent_node) = /* *INDENT-ON* */ VLIB_NODE_FUNCTION_MULTIARCH (tcp6_syn_sent_node, tcp6_syn_sent_rcv); + /** * Handles reception for all states except LISTEN, SYN-SENT and ESTABLISHED * as per RFC793 p. 64 @@ -2202,7 +2210,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, */ if (!tcp_rcv_ack_is_acceptable (tc0, b0)) { - tcp_send_reset (b0, is_ip4); + tcp_send_reset (tc0, b0, is_ip4); goto drop; } @@ -2243,6 +2251,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { ASSERT (tcp_fin (tcp0)); tc0->state = TCP_STATE_FIN_WAIT_2; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); + /* Stop all timers, 2MSL will be set lower */ tcp_connection_timers_reset (tc0); } @@ -2269,6 +2279,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* XXX test that send queue empty */ tc0->state = TCP_STATE_TIME_WAIT; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); goto drop; break; @@ -2289,6 +2300,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, } tc0->state = TCP_STATE_CLOSED; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); /* Don't delete the connection/session yet. Instead, wait a * reasonable amount of time until the pipes are cleared. In @@ -2329,10 +2341,8 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, case TCP_STATE_ESTABLISHED: case TCP_STATE_FIN_WAIT_1: case TCP_STATE_FIN_WAIT_2: - vlib_buffer_advance (b0, vnet_buffer (b0)->tcp.data_offset); - error0 = tcp_segment_rcv (tm, tc0, b0, - vnet_buffer (b0)->tcp.data_len, - &next0); + if (vnet_buffer (b0)->tcp.data_len) + error0 = tcp_segment_rcv (tm, tc0, b0, &next0); break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2357,6 +2367,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, next0 = tcp_next_output (tc0->c_is_ip4); stream_session_disconnect_notify (&tc0->connection); tc0->state = TCP_STATE_CLOSE_WAIT; + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_CLOSE_WAIT: case TCP_STATE_CLOSING: @@ -2367,6 +2378,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0->state = TCP_STATE_TIME_WAIT; tcp_connection_timers_reset (tc0); tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_2MSL_TIME); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_FIN_WAIT_2: /* Got FIN, send ACK! */ @@ -2375,6 +2387,7 @@ tcp46_rcv_process_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp_timer_set (tc0, TCP_TIMER_WAITCLOSE, TCP_CLOSEWAIT_TIME); tcp_make_ack (tc0, b0); next0 = tcp_next_output (is_ip4); + TCP_EVT_DBG (TCP_EVT_STATE_CHANGE, tc0); break; case TCP_STATE_TIME_WAIT: /* Remain in the TIME-WAIT state. Restart the 2 MSL time-wait @@ -2486,7 +2499,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, { u32 n_left_from, next_index, *from, *to_next; u32 my_thread_index = vm->thread_index; - tcp_main_t *tm = vnet_get_tcp_main (); u8 sst = is_ip4 ? SESSION_TYPE_IP4_TCP : SESSION_TYPE_IP6_TCP; from = vlib_frame_vector_args (from_frame); @@ -2549,14 +2561,10 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, /* 3. check for a SYN (did that already) */ /* Create child session and send SYN-ACK */ - pool_get (tm->connections[my_thread_index], child0); - memset (child0, 0, sizeof (*child0)); - - child0->c_c_index = child0 - tm->connections[my_thread_index]; + child0 = tcp_connection_new (my_thread_index); child0->c_lcl_port = lc0->c_lcl_port; child0->c_rmt_port = th0->src_port; child0->c_is_ip4 = is_ip4; - child0->c_thread_index = my_thread_index; child0->state = TCP_STATE_SYN_RCVD; if (is_ip4) @@ -2605,7 +2613,6 @@ tcp46_listen_inline (vlib_main_t * vm, vlib_node_runtime_t * node, child0->snd_wl2 = vnet_buffer (b0)->tcp.ack_number; tcp_connection_init_vars (child0); - TCP_EVT_DBG (TCP_EVT_SYN_RCVD, child0); /* Reuse buffer to make syn-ack and send */ @@ -2722,6 +2729,31 @@ typedef enum _tcp_input_next #define filter_flags (TCP_FLAG_SYN|TCP_FLAG_ACK|TCP_FLAG_RST|TCP_FLAG_FIN) +static u8 +tcp_lookup_is_valid (tcp_connection_t * tc, tcp_header_t * hdr) +{ + transport_connection_t *tmp; + if (!tc) + return 1; + + u8 is_valid = (tc->c_lcl_port == hdr->dst_port + && (tc->state == TCP_STATE_LISTEN + || tc->c_rmt_port == hdr->src_port)); + + if (!is_valid) + { + if ((tmp = stream_session_lookup_half_open (&tc->connection))) + { + if (tmp->lcl_port == hdr->dst_port + && tmp->rmt_port == hdr->src_port) + { + clib_warning ("half-open is valid!"); + } + } + } + return is_valid; +} + always_inline uword tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, vlib_frame_t * from_frame, int is_ip4) @@ -2774,7 +2806,6 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, n_data_bytes0 = clib_net_to_host_u16 (ip40->length) - n_advance_bytes0; - /* lookup session */ tc0 = (tcp_connection_t *) stream_session_lookup_transport_wt4 (&ip40->dst_address, @@ -2783,6 +2814,7 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tcp0->src_port, SESSION_TYPE_IP4_TCP, my_thread_index); + ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } else { @@ -2795,12 +2827,13 @@ tcp46_input_inline (vlib_main_t * vm, vlib_node_runtime_t * node, tc0 = (tcp_connection_t *) - stream_session_lookup_transport_wt6 (&ip60->src_address, - &ip60->dst_address, - tcp0->src_port, + stream_session_lookup_transport_wt6 (&ip60->dst_address, + &ip60->src_address, tcp0->dst_port, + tcp0->src_port, SESSION_TYPE_IP6_TCP, my_thread_index); + ASSERT (tcp_lookup_is_valid (tc0, tcp0)); } /* Length check */ -- cgit 1.2.3-korg